diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 368a3c56c900..8174a8e850c8 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1,1903 +1,1903 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_pci.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); /* * The PTI trampoline stack needs enough space for a hardware trapframe and a * couple of scratch registers, as well as the trapframe left behind after an * iret fault. */ CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)); extern u_int64_t hammer_time(u_int64_t, u_int64_t); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Probe 8254 PIT and TSC. */ static void native_clock_source_init(void); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = native_clock_source_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, }; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; int late_console = 1; struct kva_md_info kmi; struct region_descriptor r_idt; struct pcpu *__pcpu; struct pcpu temp_bsp_pcpu; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); bool efi_boot; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. 
*/ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); #ifdef DEV_PCI if (bootverbose && intel_graphics_stolen_base != 0) printf("intel stolen mem: base %#jx size %ju MB\n", (uintmax_t)intel_graphics_stolen_base, (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); #endif /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } static void late_ifunc_resolve(void *dummy __unused) { link_elf_late_ireloc(); } SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16); static char mce0_stack[MCE_STACK_SIZE] __aligned(16); static char nmi0_stack[NMI_STACK_SIZE] __aligned(16); static char dbg0_stack[DBG_STACK_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. 
*/ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT"); void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(div_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM 
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32), IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
-DB_SHOW_COMMAND(idt, db_show_idt)
+DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
-DB_SHOW_COMMAND(sysregs, db_show_sysregs)
+DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

-DB_SHOW_COMMAND(dbregs, db_show_dbregs)
+DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{
	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
-	db_printf("dr7\t0x%016lx\n", rdr7());
+	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp)
{
	int i, insert_idx,
physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. 
*/ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %012lx %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE, physmap, physmap_idx)) break; } } static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). 
*/ vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first)); bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. 
*/ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. 
*/ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end, 0); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void native_clock_source_init(void) { i8254_init(); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* Set up the fast syscall stuff */ void amd64_conf_fast_syscall(void) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); } void amd64_bsp_pcpu_init1(struct pcpu *pc) { struct user_segment_descriptor *gdt; PCPU_SET(prvspace, pc); gdt = *PCPU_PTR(gdt); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, PCPU_PTR(common_tss)); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); PCPU_SET(smp_tlb_gen, 1); } void amd64_bsp_pcpu_init2(uint64_t rsp0) { PCPU_SET(rsp0, rsp0); PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); PCPU_SET(curpcb, thread0.td_pcb); } void amd64_bsp_ist_init(struct pcpu *pc) { struct nmi_pcpu *np; struct amd64tss *tssp; tssp = &pc->pc_common_tss; /* doublefault stack space, runs on ist1 */ np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist1 = (long)np; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist2 = (long)np; /* * MC# stack, runs on ist3. The pcpu pointer is stored just * above the start of the ist3 stack. */ np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist3 = (long)np; /* * DB# stack, runs on ist4. */ np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist4 = (long)np; } /* * Calculate the kernel load address by inspecting page table created by loader. * The assumptions: * - kernel is mapped at KERNBASE, backed by contiguous phys memory * aligned at 2M, below 4G (the latter is important for AP startup) * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) * - kernel is mapped with 2M superpages * - all participating memory, i.e. 
kernel, modules, metadata, * page table is accessible by pre-created 1:1 mapping * (right now loader creates 1:1 mapping for lower 4G, and all * memory is from there) * - there is a usable memory block right after the end of the * mapped kernel and all modules/metadata, pointed to by * physfree, for early allocations */ vm_paddr_t __nosanitizeaddress __nosanitizememory amd64_loadaddr(void) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; uint64_t cr3; cr3 = rcr3(); pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART); pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART); pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART); return (*pde & PG_FRAME); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; uint64_t rsp0; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; size_t kstack0_sz; TSRAW(&thread0, TS_ENTER, __func__, NULL); kernphys = amd64_loadaddr(); physfree += kernphys; kmdp = init_ops.parse_preload_data(modulep); efi_boot = preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL; if (!efi_boot) { /* Tell the bios to warmboot next time */ atomic_store_short((u_short *)0x472, 0x1234); } physfree += ucode_load_bsp(physfree - kernphys + KERNSTART); physfree = roundup2(physfree, PAGE_SIZE); identify_cpu1(); identify_hypervisor(); identify_cpu_fixup_bsp(); identify_cpu2(); initializecpucache(); /* * Check for pti, pcid, and invpcid before ifuncs are * resolved, to correctly select the implementation for * pmap_activate_sw_mode(). */ pti = pti_get_default(); TUNABLE_INT_FETCH("vm.pmap.pti", &pti); TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; } else { pmap_pcid_enabled = 0; } link_elf_ireloc(kmdp); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree - kernphys + KERNSTART; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * Initialize enough of thread0 for delayed invalidation to * work very early. Rely on thread0.td_base_pri * zero-initialization, it is reset to PVM at proc0_init(). */ pmap_thread_init_invl_gen(&thread0); pc = &temp_bsp_pcpu; pcpu_init(pc, 0, sizeof(struct pcpu)); gdt = &temp_bsp_pcpu.pc_gdt[0]; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long)gdt; lgdt(&r_gdt); wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) 
properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (efi_boot) vty_set_preferred(VTY_VT); TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", &syscall_ret_l1d_flush_mode); TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable", &x86_rngds_mitg_enable); finishidentcpu(); /* Final stage of CPU initialization */ /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); initializecpu(); /* Initialize CPU registers */ amd64_bsp_ist_init(pc); /* Set the IO permission bitmap (empty due to tss seg limit) */ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); amd64_conf_fast_syscall(); /* * We initialize the PCB pointer early so that exception * handlers will work. Also set up td_critnest to short-cut * the page fault handler. 
*/ cpu_max_ext_state_size = sizeof(struct savefpu); set_top_of_stack_td(&thread0); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ #ifdef DEV_PCI /* This call might adjust phys_avail[]. */ pci_early_quirks(); #endif if (late_console) cninit(); /* * Dump the boot metadata. We have to wait for cninit() since console * output is required. If it's grossly incorrect the kernel will never * make it this far. */ if (getenv_is_true("debug.dump_modinfo_at_boot")) preload_dump(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?" #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* make an initial tss so cpu can get interrupt stack on syscall! */ rsp0 = thread0.td_md.md_stack_base; /* Ensure the stack is aligned to 16 bytes */ rsp0 &= ~0xFul; PCPU_PTR(common_tss)->tss_rsp0 = rsp0; amd64_bsp_pcpu_init2(rsp0); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); kcsan_cpu_init(0); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; kasan_init(); kmsan_init(); TSEXIT(); /* Location of kernel stack for locore */ return (thread0.td_md.md_stack_base); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. 
*/ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. 
*/ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ static void set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) { return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? set_pcb_flags_fsgsbase : set_pcb_flags_raw); } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ #undef memset #undef memmove #undef memcpy void *memset_std(void *buf, int c, size_t len); void *memset_erms(void *buf, int c, size_t len); void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); #ifdef KCSAN /* * These fail to build as ifuncs when used with KCSAN. */ void * memset(void *buf, int c, size_t len) { return (memset_std(buf, c, len)); } void * memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memmove_std(dst, src, len)); } void * memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memcpy_std(dst, src, len)); } #else DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memset_erms : memset_std); } DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memmove_erms : memmove_std); } DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } #endif void pagezero_std(void *addr); void pagezero_erms(void *addr); DEFINE_IFUNC(, void , pagezero, (void *)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 
pagezero_erms : pagezero_std); } diff --git a/sys/ddb/db_command.c b/sys/ddb/db_command.c index e6de6cd716e2..06eab31dbfa8 100644 --- a/sys/ddb/db_command.c +++ b/sys/ddb/db_command.c @@ -1,922 +1,922 @@ /*- * SPDX-License-Identifier: MIT-CMU * * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Author: David B. Golub, Carnegie Mellon University * Date: 7/90 */ /* * Command dispatcher. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Exported global variables */ int db_cmd_loop_done; db_addr_t db_dot; db_addr_t db_last_addr; db_addr_t db_prev; db_addr_t db_next; static db_cmdfcn_t db_dump; static db_cmdfcn_t db_fncall; static db_cmdfcn_t db_gdb; static db_cmdfcn_t db_halt; static db_cmdfcn_t db_kill; static db_cmdfcn_t db_reset; static db_cmdfcn_t db_stack_trace; static db_cmdfcn_t db_stack_trace_active; static db_cmdfcn_t db_stack_trace_all; static db_cmdfcn_t db_watchdog; #define DB_CMD(_name, _func, _flags) \ { \ .name = (_name), \ .fcn = (_func), \ .flag = (_flags), \ .more = NULL, \ } #define DB_TABLE(_name, _more) \ { \ .name = (_name), \ .fcn = NULL, \ .more = (_more), \ } static struct db_command db_show_active_cmds[] = { DB_CMD("trace", db_stack_trace_active, 0), }; struct db_command_table db_show_active_table = LIST_HEAD_INITIALIZER(db_show_active_table); static struct db_command db_show_all_cmds[] = { DB_CMD("trace", db_stack_trace_all, 0), }; struct db_command_table db_show_all_table = LIST_HEAD_INITIALIZER(db_show_all_table); static struct db_command db_show_cmds[] = { DB_TABLE("active", &db_show_active_table), DB_TABLE("all", &db_show_all_table), DB_CMD("registers", db_show_regs, 0), DB_CMD("breaks", db_listbreak_cmd, 0), DB_CMD("threads", db_show_threads, 0), }; struct db_command_table db_show_table = LIST_HEAD_INITIALIZER(db_show_table); static struct db_command db_cmds[] = { DB_TABLE("show", &db_show_table), DB_CMD("print", db_print_cmd, 0), DB_CMD("p", db_print_cmd, 0), DB_CMD("examine", db_examine_cmd, CS_SET_DOT), DB_CMD("x", db_examine_cmd, CS_SET_DOT), DB_CMD("search", db_search_cmd, CS_OWN|CS_SET_DOT), DB_CMD("set", db_set_cmd, CS_OWN), DB_CMD("write", db_write_cmd, CS_MORE|CS_SET_DOT), DB_CMD("w", db_write_cmd, CS_MORE|CS_SET_DOT), DB_CMD("delete", db_delete_cmd, 0), DB_CMD("d", db_delete_cmd, 0), DB_CMD("dump", db_dump, 0), DB_CMD("break", db_breakpoint_cmd, 0), DB_CMD("b", db_breakpoint_cmd, 0), DB_CMD("dwatch", db_deletewatch_cmd, 0), DB_CMD("watch", db_watchpoint_cmd, CS_MORE), 
DB_CMD("dhwatch", db_deletehwatch_cmd, 0), DB_CMD("hwatch", db_hwatchpoint_cmd, 0), DB_CMD("step", db_single_step_cmd, 0), DB_CMD("s", db_single_step_cmd, 0), DB_CMD("continue", db_continue_cmd, 0), DB_CMD("c", db_continue_cmd, 0), DB_CMD("until", db_trace_until_call_cmd, 0), DB_CMD("next", db_trace_until_matching_cmd, 0), DB_CMD("match", db_trace_until_matching_cmd, 0), DB_CMD("trace", db_stack_trace, CS_OWN), DB_CMD("t", db_stack_trace, CS_OWN), /* XXX alias for active trace */ DB_CMD("acttrace", db_stack_trace_active, 0), /* XXX alias for all trace */ DB_CMD("alltrace", db_stack_trace_all, 0), DB_CMD("where", db_stack_trace, CS_OWN), DB_CMD("bt", db_stack_trace, CS_OWN), DB_CMD("call", db_fncall, CS_OWN), DB_CMD("ps", db_ps, 0), DB_CMD("gdb", db_gdb, 0), DB_CMD("halt", db_halt, 0), DB_CMD("reboot", db_reset, 0), DB_CMD("reset", db_reset, 0), DB_CMD("kill", db_kill, CS_OWN), DB_CMD("watchdog", db_watchdog, CS_OWN), DB_CMD("thread", db_set_thread, 0), DB_CMD("run", db_run_cmd, CS_OWN), DB_CMD("script", db_script_cmd, CS_OWN), DB_CMD("scripts", db_scripts_cmd, 0), DB_CMD("unscript", db_unscript_cmd, CS_OWN), DB_CMD("capture", db_capture_cmd, CS_OWN), DB_CMD("textdump", db_textdump_cmd, CS_OWN), DB_CMD("findstack", db_findstack_cmd, 0), }; struct db_command_table db_cmd_table = LIST_HEAD_INITIALIZER(db_cmd_table); #undef DB_CMD #undef DB_TABLE static struct db_command *db_last_command = NULL; /* * if 'ed' style: 'dot' is set at start of last item printed, * and '+' points to next line. * Otherwise: 'dot' points to next item, '..' points to last. */ static bool db_ed_style = true; /* * Utility routine - discard tokens through end-of-line. */ void db_skip_to_eol(void) { int t; do { t = db_read_token(); } while (t != tEOL); } /* * Results of command search. */ #define CMD_UNIQUE 0 #define CMD_FOUND 1 #define CMD_NONE 2 #define CMD_AMBIGUOUS 3 #define CMD_HELP 4 static void db_cmd_match(char *name, struct db_command *cmd, struct db_command **cmdp, int *resultp); static void db_cmd_list(struct db_command_table *table); static int db_cmd_search(char *name, struct db_command_table *table, struct db_command **cmdp); static void db_command(struct db_command **last_cmdp, struct db_command_table *cmd_table, bool dopager); /* * Initialize the command lists from the static tables. */ void db_command_init(void) { int i; for (i = 0; i < nitems(db_cmds); i++) db_command_register(&db_cmd_table, &db_cmds[i]); for (i = 0; i < nitems(db_show_cmds); i++) db_command_register(&db_show_table, &db_show_cmds[i]); for (i = 0; i < nitems(db_show_active_cmds); i++) db_command_register(&db_show_active_table, &db_show_active_cmds[i]); for (i = 0; i < nitems(db_show_all_cmds); i++) db_command_register(&db_show_all_table, &db_show_all_cmds[i]); } /* * Register a command. */ void db_command_register(struct db_command_table *list, struct db_command *cmd) { struct db_command *c, *last; last = NULL; LIST_FOREACH(c, list, next) { int n = strcmp(cmd->name, c->name); /* Check that the command is not already present. */ if (n == 0) { printf("%s: Warning, the command \"%s\" already exists;" " ignoring request\n", __func__, cmd->name); return; } if (n < 0) { /* NB: keep list sorted lexicographically */ LIST_INSERT_BEFORE(c, cmd, next); return; } last = c; } if (last == NULL) LIST_INSERT_HEAD(list, cmd, next); else LIST_INSERT_AFTER(last, cmd, next); } /* * Remove a command previously registered with db_command_register. 
*/ void db_command_unregister(struct db_command_table *list, struct db_command *cmd) { struct db_command *c; LIST_FOREACH(c, list, next) { if (cmd == c) { LIST_REMOVE(cmd, next); return; } } /* NB: intentionally quiet */ } /* * Helper function to match a single command. */ static void db_cmd_match(char *name, struct db_command *cmd, struct db_command **cmdp, int *resultp) { char *lp, *rp; int c; lp = name; rp = cmd->name; while ((c = *lp) == *rp) { if (c == 0) { /* complete match */ *cmdp = cmd; *resultp = CMD_UNIQUE; return; } lp++; rp++; } if (c == 0) { /* end of name, not end of command - partial match */ if (*resultp == CMD_FOUND) { *resultp = CMD_AMBIGUOUS; /* but keep looking for a full match - this lets us match single letters */ } else if (*resultp == CMD_NONE) { *cmdp = cmd; *resultp = CMD_FOUND; } } } /* * Search for command prefix. */ static int db_cmd_search(char *name, struct db_command_table *table, struct db_command **cmdp) { struct db_command *cmd; int result = CMD_NONE; LIST_FOREACH(cmd, table, next) { db_cmd_match(name,cmd,cmdp,&result); if (result == CMD_UNIQUE) break; } if (result == CMD_NONE) { /* check for 'help' */ if (name[0] == 'h' && name[1] == 'e' && name[2] == 'l' && name[3] == 'p') result = CMD_HELP; } return (result); } static void db_cmd_list(struct db_command_table *table) { struct db_command *cmd; int have_subcommands; have_subcommands = 0; LIST_FOREACH(cmd, table, next) { if (cmd->more != NULL) have_subcommands++; db_printf("%-16s", cmd->name); db_end_line(16); } if (have_subcommands > 0) { db_printf("\nThe following have subcommands; append \"help\" " "to list (e.g. \"show help\"):\n"); LIST_FOREACH(cmd, table, next) { if (cmd->more == NULL) continue; db_printf("%-16s", cmd->name); db_end_line(16); } } } static void db_command(struct db_command **last_cmdp, struct db_command_table *cmd_table, bool dopager) { char modif[TOK_STRING_SIZE]; struct db_command *cmd = NULL; db_expr_t addr, count; int t, result; bool have_addr = false; t = db_read_token(); if (t == tEOL) { /* empty line repeats last command, at 'next' */ cmd = *last_cmdp; addr = (db_expr_t)db_next; have_addr = false; count = 1; modif[0] = '\0'; } else if (t == tEXCL) { db_fncall((db_expr_t)0, false, (db_expr_t)0, NULL); return; } else if (t != tIDENT) { db_printf("Unrecognized input; use \"help\" " "to list available commands\n"); db_flush_lex(); return; } else { /* * Search for command */ while (cmd_table != NULL) { result = db_cmd_search(db_tok_string, cmd_table, &cmd); switch (result) { case CMD_NONE: db_printf("No such command; use \"help\" " "to list available commands\n"); db_flush_lex(); return; case CMD_AMBIGUOUS: db_printf("Ambiguous\n"); db_flush_lex(); return; case CMD_HELP: if (cmd_table == &db_cmd_table) { db_printf("This is ddb(4), the kernel debugger; " "see https://man.FreeBSD.org/ddb/4 for help.\n"); db_printf("Use \"bt\" for backtrace, \"dump\" for " "kernel core dump, \"reset\" to reboot.\n"); db_printf("Available commands:\n"); } db_cmd_list(cmd_table); db_flush_lex(); return; case CMD_UNIQUE: case CMD_FOUND: break; } if ((cmd_table = cmd->more) != NULL) { t = db_read_token(); if (t != tIDENT) { db_printf("Subcommand required; " "available subcommands:\n"); db_cmd_list(cmd_table); db_flush_lex(); return; } } } if ((cmd->flag & CS_OWN) == 0) { /* * Standard syntax: * command [/modifier] [addr] [,count] */ t = db_read_token(); if (t == tSLASH) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); db_flush_lex(); return; } db_strcpy(modif, db_tok_string); } else 
{ db_unread_token(t); modif[0] = '\0'; } if (db_expression(&addr)) { db_dot = (db_addr_t) addr; db_last_addr = db_dot; have_addr = true; } else { addr = (db_expr_t) db_dot; have_addr = false; } t = db_read_token(); if (t == tCOMMA) { if (!db_expression(&count)) { db_printf("Count missing\n"); db_flush_lex(); return; } } else { db_unread_token(t); count = -1; } if ((cmd->flag & CS_MORE) == 0) { db_skip_to_eol(); } } } *last_cmdp = cmd; if (cmd != NULL) { /* * Execute the command. */ if (dopager) db_enable_pager(); else db_disable_pager(); (*cmd->fcn)(addr, have_addr, count, modif); if (dopager) db_disable_pager(); if (cmd->flag & CS_SET_DOT) { /* * If command changes dot, set dot to previous address * displayed (if 'ed' style). */ db_dot = db_ed_style ? db_prev : db_next; } else { /* * If command does not change dot, set 'next' location * to be the same. */ db_next = db_dot; } } } /* * At least one non-optional command must be implemented using * DB_COMMAND() so that db_cmd_set gets created. Here is one. */ -DB_COMMAND(panic, db_panic) +DB_COMMAND_FLAGS(panic, db_panic, DB_CMD_MEMSAFE) { db_disable_pager(); panic("from debugger"); } void db_command_loop(void) { /* * Initialize 'prev' and 'next' to dot. */ db_prev = db_dot; db_next = db_dot; db_cmd_loop_done = 0; while (!db_cmd_loop_done) { if (db_print_position() != 0) db_printf("\n"); db_printf("db> "); (void)db_read_line(); db_command(&db_last_command, &db_cmd_table, /* dopager */ true); } } /* * Execute a command on behalf of a script. The caller is responsible for * making sure that the command string is < DB_MAXLINE or it will be * truncated. * * XXXRW: Runs by injecting faked input into DDB input stream; it would be * nicer to use an alternative approach that didn't mess with the previous * command buffer. */ void db_command_script(const char *command) { db_prev = db_next = db_dot; db_inject_line(command); db_command(&db_last_command, &db_cmd_table, /* dopager */ false); } void db_error(const char *s) { if (s) db_printf("%s", s); db_flush_lex(); kdb_reenter_silent(); } static void db_dump(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { int error; if (textdump_pending) { db_printf("textdump_pending set.\n" "run \"textdump unset\" first or \"textdump dump\" for a textdump.\n"); return; } error = doadump(false); if (error) { db_printf("Cannot dump: "); switch (error) { case EBUSY: db_printf("debugger got invoked while dumping.\n"); break; case ENXIO: db_printf("no dump device specified.\n"); break; default: db_printf("unknown error (error=%d).\n", error); break; } } } /* * Call random function: * !expr(arg,arg,arg) */ /* The generic implementation supports a maximum of 10 arguments. 
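 *
 * As an aside, the dispatch trick used by db_fncall_generic() below can be
 * illustrated outside the kernel: cast a plain integer address to a
 * fixed-arity function type and always pass the maximum number of
 * arguments, relying on the calling convention to let the callee ignore the
 * extras.  The following standalone sketch is illustrative only; the names
 * demo_f, demo_add and demo_call are not kernel API.
 */

#if 0	/* standalone illustrative sketch, not compiled as part of ddb */
#include <stdint.h>
#include <stdio.h>

typedef long demo_f(long, long, long, long, long, long, long, long, long,
    long);

static long
demo_add(long a, long b)
{

	return (a + b);
}

static long
demo_call(uintptr_t addr, long args[10])
{
	demo_f *f = (demo_f *)addr;

	/* Unused argument slots were zero-initialized by the caller. */
	return ((*f)(args[0], args[1], args[2], args[3], args[4], args[5],
	    args[6], args[7], args[8], args[9]));
}

int
main(void)
{
	long args[10] = { 2, 3 };	/* remaining slots default to 0 */

	printf("%ld\n", demo_call((uintptr_t)demo_add, args));	/* prints 5 */
	return (0);
}
#endif

/*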
*/ typedef db_expr_t __db_f(db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t); static __inline int db_fncall_generic(db_expr_t addr, db_expr_t *rv, int nargs, db_expr_t args[]) { __db_f *f = (__db_f *)addr; if (nargs > 10) { db_printf("Too many arguments (max 10)\n"); return (0); } *rv = (*f)(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); return (1); } static void db_fncall(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t fn_addr; db_expr_t args[DB_MAXARGS]; int nargs = 0; db_expr_t retval; int t; if (!db_expression(&fn_addr)) { db_printf("Bad function\n"); db_flush_lex(); return; } t = db_read_token(); if (t == tLPAREN) { if (db_expression(&args[0])) { nargs++; while ((t = db_read_token()) == tCOMMA) { if (nargs == DB_MAXARGS) { db_printf("Too many arguments (max %d)\n", DB_MAXARGS); db_flush_lex(); return; } if (!db_expression(&args[nargs])) { db_printf("Argument missing\n"); db_flush_lex(); return; } nargs++; } db_unread_token(t); } if (db_read_token() != tRPAREN) { db_printf("Mismatched parens\n"); db_flush_lex(); return; } } db_skip_to_eol(); db_disable_pager(); if (DB_CALL(fn_addr, &retval, nargs, args)) db_printf("= %#lr\n", (long)retval); } static void db_halt(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { cpu_halt(); } static void db_kill(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t old_radix, pid, sig; struct proc *p; #define DB_ERROR(f) do { db_printf f; db_flush_lex(); goto out; } while (0) /* * PIDs and signal numbers are typically represented in base * 10, so make that the default here. It can, of course, be * overridden by specifying a prefix. */ old_radix = db_radix; db_radix = 10; /* Retrieve arguments. */ if (!db_expression(&sig)) DB_ERROR(("Missing signal number\n")); if (!db_expression(&pid)) DB_ERROR(("Missing process ID\n")); db_skip_to_eol(); if (!_SIG_VALID(sig)) DB_ERROR(("Signal number out of range\n")); /* * Find the process in question. allproc_lock is not needed * since we're in DDB. */ /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) if (p->p_pid == pid) break; /* sx_sunlock(&allproc_lock); */ if (p == NULL) DB_ERROR(("Can't find process with pid %ld\n", (long) pid)); /* If it's already locked, bail; otherwise, do the deed. */ if (PROC_TRYLOCK(p) == 0) DB_ERROR(("Can't lock process with pid %ld\n", (long) pid)); else { pksignal(p, sig, NULL); PROC_UNLOCK(p); } out: db_radix = old_radix; #undef DB_ERROR } /* * Reboot. In case there is an additional argument, take it as delay in * seconds. Default to 15s if we cannot parse it and make sure we will * never wait longer than 1 week. Some code is similar to * kern_shutdown.c:shutdown_panic(). */ #ifndef DB_RESET_MAXDELAY #define DB_RESET_MAXDELAY (3600 * 24 * 7) #endif static void db_reset(db_expr_t addr, bool have_addr, db_expr_t count __unused, char *modif __unused) { int delay, loop; if (have_addr) { delay = (int)db_hex2dec(addr); /* If we parse to fail, use 15s. */ if (delay == -1) delay = 15; /* Cap at one week. */ if ((uintmax_t)delay > (uintmax_t)DB_RESET_MAXDELAY) delay = DB_RESET_MAXDELAY; db_printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", delay); for (loop = delay * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? 
*/ if (cncheckc() != -1) return; } } cpu_reset(); } static void db_watchdog(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t old_radix, tout; int err, i; old_radix = db_radix; db_radix = 10; err = db_expression(&tout); db_skip_to_eol(); db_radix = old_radix; /* If no argument is provided the watchdog will just be disabled. */ if (err == 0) { db_printf("No argument provided, disabling watchdog\n"); tout = 0; } else if ((tout & WD_INTERVAL) == WD_TO_NEVER) { db_error("Out of range watchdog interval\n"); return; } EVENTHANDLER_INVOKE(watchdog_list, tout, &i); } static void db_gdb(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { if (kdb_dbbe_select("gdb") != 0) { db_printf("The remote GDB backend could not be selected.\n"); return; } /* * Mark that we are done in the debugger. kdb_trap() * should re-enter with the new backend. */ db_cmd_loop_done = 1; db_printf("(ctrl-c will return control to ddb)\n"); } static void db_stack_trace(db_expr_t tid, bool hastid, db_expr_t count, char *modif) { struct thread *td; db_expr_t radix; pid_t pid; int t; /* * We parse our own arguments. We don't like the default radix. */ radix = db_radix; db_radix = 10; hastid = db_expression(&tid); t = db_read_token(); if (t == tCOMMA) { if (!db_expression(&count)) { db_printf("Count missing\n"); db_flush_lex(); db_radix = radix; return; } } else { db_unread_token(t); count = -1; } db_skip_to_eol(); db_radix = radix; if (hastid) { td = kdb_thr_lookup((lwpid_t)tid); if (td == NULL) td = kdb_thr_from_pid((pid_t)tid); if (td == NULL) { db_printf("Thread %d not found\n", (int)tid); return; } } else td = kdb_thread; if (td->td_proc != NULL) pid = td->td_proc->p_pid; else pid = -1; db_printf("Tracing pid %d tid %ld td %p\n", pid, (long)td->td_tid, td); if (td->td_proc != NULL && (td->td_proc->p_flag & P_INMEM) == 0) db_printf("--- swapped out\n"); else db_trace_thread(td, count); } static void _db_stack_trace_all(bool active_only) { struct thread *td; jmp_buf jb; void *prev_jb; for (td = kdb_thr_first(); td != NULL; td = kdb_thr_next(td)) { prev_jb = kdb_jmpbuf(jb); if (setjmp(jb) == 0) { if (TD_IS_RUNNING(td)) db_printf("\nTracing command %s pid %d" " tid %ld td %p (CPU %d)\n", td->td_proc->p_comm, td->td_proc->p_pid, (long)td->td_tid, td, td->td_oncpu); else if (active_only) continue; else db_printf("\nTracing command %s pid %d" " tid %ld td %p\n", td->td_proc->p_comm, td->td_proc->p_pid, (long)td->td_tid, td); if (td->td_proc->p_flag & P_INMEM) db_trace_thread(td, -1); else db_printf("--- swapped out\n"); if (db_pager_quit) { kdb_jmpbuf(prev_jb); return; } } kdb_jmpbuf(prev_jb); } } static void db_stack_trace_active(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { _db_stack_trace_all(true); } static void db_stack_trace_all(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { _db_stack_trace_all(false); } /* * Take the parsed expression value from the command line that was parsed * as a hexadecimal value and convert it as if the expression was parsed * as a decimal value. Returns -1 if the expression was not a valid * decimal value. 
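 * For example, typing "15" while the radix is 16 yields the value 0x15
 * (21 decimal); reinterpreting its nibbles 1 and 5 as decimal digits
 * recovers 15.  A value such as 0x1a is rejected because the nibble 0xa is
 * not a decimal digit.  db_hex2dec() below implements that digit walk; the
 * following standalone sketch of the same arithmetic is illustrative only
 * (hex2dec_demo is not kernel API).
 */

#if 0	/* standalone illustrative sketch, not compiled as part of ddb */
#include <stdio.h>

static long
hex2dec_demo(unsigned long x)
{
	long val, place;

	val = 0;
	place = 1;
	while (x != 0) {
		if (x % 16 > 9)		/* nibble a-f: not a decimal digit */
			return (-1);
		val += (long)(x % 16) * place;
		x >>= 4;
		place *= 10;
	}
	return (val);
}

int
main(void)
{

	printf("%ld\n", hex2dec_demo(0x15));	/* prints 15 */
	printf("%ld\n", hex2dec_demo(0x1a));	/* prints -1 */
	return (0);
}
#endif

/*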
*/ db_expr_t db_hex2dec(db_expr_t expr) { uintptr_t x, y; db_expr_t val; y = 1; val = 0; x = expr; while (x != 0) { if (x % 16 > 9) return (-1); val += (x % 16) * (y); x >>= 4; y *= 10; } return (val); } diff --git a/sys/ddb/db_watch.c b/sys/ddb/db_watch.c index 3226b050a4c3..62585731e82e 100644 --- a/sys/ddb/db_watch.c +++ b/sys/ddb/db_watch.c @@ -1,337 +1,337 @@ /*- * SPDX-License-Identifier: MIT-CMU * * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Author: Richard P. Draves, Carnegie Mellon University * Date: 10/90 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * Watchpoints. */ static bool db_watchpoints_inserted = true; #define NWATCHPOINTS 100 static struct db_watchpoint db_watch_table[NWATCHPOINTS]; static db_watchpoint_t db_next_free_watchpoint = &db_watch_table[0]; static db_watchpoint_t db_free_watchpoints = 0; static db_watchpoint_t db_watchpoint_list = 0; static db_watchpoint_t db_watchpoint_alloc(void); static void db_watchpoint_free(db_watchpoint_t watch); static void db_delete_watchpoint(vm_map_t map, db_addr_t addr); #ifdef notused static bool db_find_watchpoint(vm_map_t map, db_addr_t addr, db_regs_t *regs); #endif static void db_list_watchpoints(void); static void db_set_watchpoint(vm_map_t map, db_addr_t addr, vm_size_t size); static db_watchpoint_t db_watchpoint_alloc(void) { db_watchpoint_t watch; if ((watch = db_free_watchpoints) != 0) { db_free_watchpoints = watch->link; return (watch); } if (db_next_free_watchpoint == &db_watch_table[NWATCHPOINTS]) { db_printf("All watchpoints used.\n"); return (0); } watch = db_next_free_watchpoint; db_next_free_watchpoint++; return (watch); } static void db_watchpoint_free(db_watchpoint_t watch) { watch->link = db_free_watchpoints; db_free_watchpoints = watch; } static void db_set_watchpoint(vm_map_t map, db_addr_t addr, vm_size_t size) { db_watchpoint_t watch; if (map == NULL) { db_printf("No map.\n"); return; } /* * Should we do anything fancy with overlapping regions? 
*/ for (watch = db_watchpoint_list; watch != 0; watch = watch->link) if (db_map_equal(watch->map, map) && (watch->loaddr == addr) && (watch->hiaddr == addr+size)) { db_printf("Already set.\n"); return; } watch = db_watchpoint_alloc(); if (watch == 0) { db_printf("Too many watchpoints.\n"); return; } watch->map = map; watch->loaddr = addr; watch->hiaddr = addr+size; watch->link = db_watchpoint_list; db_watchpoint_list = watch; db_watchpoints_inserted = false; } static void db_delete_watchpoint(vm_map_t map, db_addr_t addr) { db_watchpoint_t watch; db_watchpoint_t *prev; for (prev = &db_watchpoint_list; (watch = *prev) != 0; prev = &watch->link) if (db_map_equal(watch->map, map) && (watch->loaddr <= addr) && (addr < watch->hiaddr)) { *prev = watch->link; db_watchpoint_free(watch); return; } db_printf("Not set.\n"); } static void db_list_watchpoints(void) { db_watchpoint_t watch; if (db_watchpoint_list == 0) { db_printf("No watchpoints set\n"); return; } #ifdef __LP64__ db_printf(" Map Address Size\n"); #else db_printf(" Map Address Size\n"); #endif for (watch = db_watchpoint_list; watch != 0; watch = watch->link) #ifdef __LP64__ db_printf("%s%16p %16lx %lx\n", #else db_printf("%s%8p %8lx %lx\n", #endif db_map_current(watch->map) ? "*" : " ", (void *)watch->map, (long)watch->loaddr, (long)watch->hiaddr - (long)watch->loaddr); } /* Delete watchpoint */ /*ARGSUSED*/ void db_deletewatch_cmd(db_expr_t addr, bool have_addr, db_expr_t count, char *modif) { db_delete_watchpoint(db_map_addr(addr), addr); } /* Set watchpoint */ /*ARGSUSED*/ void db_watchpoint_cmd(db_expr_t addr, bool have_addr, db_expr_t count, char *modif) { vm_size_t size; db_expr_t value; if (db_expression(&value)) size = (vm_size_t) value; else size = 4; db_skip_to_eol(); db_set_watchpoint(db_map_addr(addr), addr, size); } /* * At least one non-optional show-command must be implemented using * DB_SHOW_COMMAND() so that db_show_cmd_set gets created. Here is one. */ -DB_SHOW_COMMAND(watches, db_listwatch_cmd) +DB_SHOW_COMMAND_FLAGS(watches, db_listwatch_cmd, DB_CMD_MEMSAFE) { db_list_watchpoints(); db_md_list_watchpoints(); } void db_set_watchpoints(void) { db_watchpoint_t watch; if (!db_watchpoints_inserted) { for (watch = db_watchpoint_list; watch != 0; watch = watch->link) pmap_protect(watch->map->pmap, trunc_page(watch->loaddr), round_page(watch->hiaddr), VM_PROT_READ); db_watchpoints_inserted = true; } } void db_clear_watchpoints(void) { db_watchpoints_inserted = false; } #ifdef notused static bool db_find_watchpoint(vm_map_t map, db_addr_t addr, db_regs_t regs) { db_watchpoint_t watch; db_watchpoint_t found = 0; for (watch = db_watchpoint_list; watch != 0; watch = watch->link) if (db_map_equal(watch->map, map)) { if ((watch->loaddr <= addr) && (addr < watch->hiaddr)) return (true); else if ((trunc_page(watch->loaddr) <= addr) && (addr < round_page(watch->hiaddr))) found = watch; } /* * We didn't hit exactly on a watchpoint, but we are * in a protected region. We want to single-step * and then re-protect. */ if (found) { db_watchpoints_inserted = false; db_single_step(regs); } return (false); } #endif /* Delete hardware watchpoint */ void db_deletehwatch_cmd(db_expr_t addr, bool have_addr, db_expr_t size, char *modif) { int rc; if (size < 0) size = 4; rc = kdb_cpu_clr_watchpoint((vm_offset_t)addr, (vm_size_t)size); switch (rc) { case ENXIO: /* Not supported, ignored. 
*/ break; case EINVAL: db_printf("Invalid watchpoint address or size.\n"); break; default: if (rc != 0) db_printf("Hardware watchpoint could not be deleted, " "status=%d\n", rc); break; } } /* Set hardware watchpoint */ void db_hwatchpoint_cmd(db_expr_t addr, bool have_addr, db_expr_t size, char *modif) { int rc; if (size < 0) size = 4; rc = kdb_cpu_set_watchpoint((vm_offset_t)addr, (vm_size_t)size, KDB_DBG_ACCESS_W); switch (rc) { case EINVAL: db_printf("Invalid watchpoint size or address.\n"); break; case EBUSY: db_printf("No hardware watchpoints available.\n"); break; case ENXIO: db_printf("Hardware watchpoints are not supported on this platform.\n"); break; default: if (rc != 0) db_printf("Could not set hardware watchpoint, " "status=%d\n", rc); } } diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 6c09212c2656..34eab9fdd2bb 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -1,6839 +1,6839 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * Copyright (c) 2000, Michael Smith * Copyright (c) 2000, BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_iommu.h" #include "opt_bus.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) #include #endif #include #include #include #include #ifdef PCI_IOV #include #include #endif #include #include #include #include #include #include "pcib_if.h" #include "pci_if.h" #define PCIR_IS_BIOS(cfg, reg) \ (((cfg)->hdrtype == PCIM_HDRTYPE_NORMAL && reg == PCIR_BIOS) || \ ((cfg)->hdrtype == PCIM_HDRTYPE_BRIDGE && reg == PCIR_BIOS_1)) static int pci_has_quirk(uint32_t devid, int quirk); static pci_addr_t pci_mapbase(uint64_t mapreg); static const char *pci_maptype(uint64_t mapreg); static int pci_maprange(uint64_t mapreg); static pci_addr_t pci_rombase(uint64_t mapreg); static int pci_romsize(uint64_t testval); static void pci_fixancient(pcicfgregs *cfg); static int pci_printf(pcicfgregs *cfg, const char *fmt, ...); static int pci_porten(device_t dev); static int pci_memen(device_t dev); static void pci_assign_interrupt(device_t bus, device_t dev, int force_route); static int pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl, int force, int prefetch); static int pci_probe(device_t dev); static void pci_load_vendor_data(void); static int pci_describe_parse_line(char **ptr, int *vendor, int *device, char **desc); static char *pci_describe_device(device_t dev); static int pci_modevent(module_t mod, int what, void *arg); static void pci_hdrtypedata(device_t pcib, int b, int s, int f, pcicfgregs *cfg); static void pci_read_cap(device_t pcib, pcicfgregs *cfg); static int pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t *data); #if 0 static int pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t data); #endif static void pci_read_vpd(device_t pcib, pcicfgregs *cfg); static void pci_mask_msix(device_t dev, u_int index); static void pci_unmask_msix(device_t dev, u_int index); static int pci_msi_blacklisted(void); static int pci_msix_blacklisted(void); static void pci_resume_msi(device_t dev); static void pci_resume_msix(device_t dev); static int pci_remap_intr_method(device_t bus, device_t dev, u_int irq); static void pci_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp); static int pci_reset_post(device_t dev, device_t child); static int pci_reset_prepare(device_t dev, device_t child); static int pci_reset_child(device_t dev, device_t child, int flags); static int pci_get_id_method(device_t dev, device_t child, enum pci_id_type type, uintptr_t *rid); static struct pci_devinfo * pci_fill_devinfo(device_t pcib, device_t bus, int d, int b, int s, int f, uint16_t vid, uint16_t did); static device_method_t pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, pci_probe), DEVMETHOD(device_attach, pci_attach), DEVMETHOD(device_detach, pci_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, pci_resume), /* Bus interface */ DEVMETHOD(bus_print_child, pci_print_child), DEVMETHOD(bus_probe_nomatch, pci_probe_nomatch), DEVMETHOD(bus_read_ivar, pci_read_ivar), DEVMETHOD(bus_write_ivar, pci_write_ivar), DEVMETHOD(bus_driver_added, pci_driver_added), DEVMETHOD(bus_setup_intr, pci_setup_intr), DEVMETHOD(bus_teardown_intr, pci_teardown_intr), 
DEVMETHOD(bus_reset_prepare, pci_reset_prepare), DEVMETHOD(bus_reset_post, pci_reset_post), DEVMETHOD(bus_reset_child, pci_reset_child), DEVMETHOD(bus_get_dma_tag, pci_get_dma_tag), DEVMETHOD(bus_get_resource_list,pci_get_resource_list), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_delete_resource, pci_delete_resource), DEVMETHOD(bus_alloc_resource, pci_alloc_resource), DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource), DEVMETHOD(bus_release_resource, pci_release_resource), DEVMETHOD(bus_activate_resource, pci_activate_resource), DEVMETHOD(bus_deactivate_resource, pci_deactivate_resource), DEVMETHOD(bus_child_deleted, pci_child_deleted), DEVMETHOD(bus_child_detached, pci_child_detached), DEVMETHOD(bus_child_pnpinfo, pci_child_pnpinfo_method), DEVMETHOD(bus_child_location, pci_child_location_method), DEVMETHOD(bus_get_device_path, pci_get_device_path_method), DEVMETHOD(bus_hint_device_unit, pci_hint_device_unit), DEVMETHOD(bus_remap_intr, pci_remap_intr_method), DEVMETHOD(bus_suspend_child, pci_suspend_child), DEVMETHOD(bus_resume_child, pci_resume_child), DEVMETHOD(bus_rescan, pci_rescan_method), /* PCI interface */ DEVMETHOD(pci_read_config, pci_read_config_method), DEVMETHOD(pci_write_config, pci_write_config_method), DEVMETHOD(pci_enable_busmaster, pci_enable_busmaster_method), DEVMETHOD(pci_disable_busmaster, pci_disable_busmaster_method), DEVMETHOD(pci_enable_io, pci_enable_io_method), DEVMETHOD(pci_disable_io, pci_disable_io_method), DEVMETHOD(pci_get_vpd_ident, pci_get_vpd_ident_method), DEVMETHOD(pci_get_vpd_readonly, pci_get_vpd_readonly_method), DEVMETHOD(pci_get_powerstate, pci_get_powerstate_method), DEVMETHOD(pci_set_powerstate, pci_set_powerstate_method), DEVMETHOD(pci_assign_interrupt, pci_assign_interrupt_method), DEVMETHOD(pci_find_cap, pci_find_cap_method), DEVMETHOD(pci_find_next_cap, pci_find_next_cap_method), DEVMETHOD(pci_find_extcap, pci_find_extcap_method), DEVMETHOD(pci_find_next_extcap, pci_find_next_extcap_method), DEVMETHOD(pci_find_htcap, pci_find_htcap_method), DEVMETHOD(pci_find_next_htcap, pci_find_next_htcap_method), DEVMETHOD(pci_alloc_msi, pci_alloc_msi_method), DEVMETHOD(pci_alloc_msix, pci_alloc_msix_method), DEVMETHOD(pci_enable_msi, pci_enable_msi_method), DEVMETHOD(pci_enable_msix, pci_enable_msix_method), DEVMETHOD(pci_disable_msi, pci_disable_msi_method), DEVMETHOD(pci_remap_msix, pci_remap_msix_method), DEVMETHOD(pci_release_msi, pci_release_msi_method), DEVMETHOD(pci_msi_count, pci_msi_count_method), DEVMETHOD(pci_msix_count, pci_msix_count_method), DEVMETHOD(pci_msix_pba_bar, pci_msix_pba_bar_method), DEVMETHOD(pci_msix_table_bar, pci_msix_table_bar_method), DEVMETHOD(pci_get_id, pci_get_id_method), DEVMETHOD(pci_alloc_devinfo, pci_alloc_devinfo_method), DEVMETHOD(pci_child_added, pci_child_added_method), #ifdef PCI_IOV DEVMETHOD(pci_iov_attach, pci_iov_attach_method), DEVMETHOD(pci_iov_detach, pci_iov_detach_method), DEVMETHOD(pci_create_iov_child, pci_create_iov_child_method), #endif DEVMETHOD_END }; DEFINE_CLASS_0(pci, pci_driver, pci_methods, sizeof(struct pci_softc)); EARLY_DRIVER_MODULE(pci, pcib, pci_driver, pci_modevent, NULL, BUS_PASS_BUS); MODULE_VERSION(pci, 1); static char *pci_vendordata; static size_t pci_vendordata_size; struct pci_quirk { uint32_t devid; /* Vendor/device of the card */ int type; #define PCI_QUIRK_MAP_REG 1 /* PCI map register in weird place */ #define PCI_QUIRK_DISABLE_MSI 2 /* Neither MSI nor MSI-X work */ #define 
PCI_QUIRK_ENABLE_MSI_VM 3 /* Older chipset in VM where MSI works */ #define PCI_QUIRK_UNMAP_REG 4 /* Ignore PCI map register */ #define PCI_QUIRK_DISABLE_MSIX 5 /* MSI-X doesn't work */ #define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */ #define PCI_QUIRK_REALLOC_BAR 7 /* Can't allocate memory at the default address */ int arg1; int arg2; }; static const struct pci_quirk pci_quirks[] = { /* The Intel 82371AB and 82443MX have a map register at offset 0x90. */ { 0x71138086, PCI_QUIRK_MAP_REG, 0x90, 0 }, { 0x719b8086, PCI_QUIRK_MAP_REG, 0x90, 0 }, /* As does the Serverworks OSB4 (the SMBus mapping register) */ { 0x02001166, PCI_QUIRK_MAP_REG, 0x90, 0 }, /* * MSI doesn't work with the ServerWorks CNB20-HE Host Bridge * or the CMIC-SL (AKA ServerWorks GC_LE). */ { 0x00141166, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x00171166, PCI_QUIRK_DISABLE_MSI, 0, 0 }, /* * MSI doesn't work on earlier Intel chipsets including * E7500, E7501, E7505, 845, 865, 875/E7210, and 855. */ { 0x25408086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x254c8086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x25508086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x25608086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x25708086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x25788086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, { 0x35808086, PCI_QUIRK_DISABLE_MSI, 0, 0 }, /* * MSI doesn't work with devices behind the AMD 8131 HT-PCIX * bridge. */ { 0x74501022, PCI_QUIRK_DISABLE_MSI, 0, 0 }, /* * Some virtualization environments emulate an older chipset * but support MSI just fine. QEMU uses the Intel 82440. */ { 0x12378086, PCI_QUIRK_ENABLE_MSI_VM, 0, 0 }, /* * HPET MMIO base address may appear in Bar1 for AMD SB600 SMBus * controller depending on SoftPciRst register (PM_IO 0x55 [7]). * It prevents us from attaching hpet(4) when the bit is unset. * Note this quirk only affects SB600 revision A13 and earlier. * For SB600 A21 and later, firmware must set the bit to hide it. * For SB700 and later, it is unused and hardcoded to zero. */ { 0x43851002, PCI_QUIRK_UNMAP_REG, 0x14, 0 }, /* * Atheros AR8161/AR8162/E2200/E2400/E2500 Ethernet controllers have * a bug that MSI interrupt does not assert if PCIM_CMD_INTxDIS bit * of the command register is set. */ { 0x10911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, { 0xE0911969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, { 0xE0A11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, { 0xE0B11969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, { 0x10901969, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* * Broadcom BCM5714(S)/BCM5715(S)/BCM5780(S) Ethernet MACs don't * issue MSI interrupts with PCIM_CMD_INTxDIS set either. */ { 0x166814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714 */ { 0x166914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5714S */ { 0x166a14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780 */ { 0x166b14e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5780S */ { 0x167814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715 */ { 0x167914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715S */ /* * HPE Gen 10 VGA has a memory range that can't be allocated in the * expected place. 
*/ { 0x98741002, PCI_QUIRK_REALLOC_BAR, 0, 0 }, { 0 } }; /* map register information */ #define PCI_MAPMEM 0x01 /* memory map */ #define PCI_MAPMEMP 0x02 /* prefetchable memory map */ #define PCI_MAPPORT 0x04 /* port map */ struct devlist pci_devq; uint32_t pci_generation; uint32_t pci_numdevs = 0; static int pcie_chipset, pcix_chipset; /* sysctl vars */ SYSCTL_NODE(_hw, OID_AUTO, pci, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "PCI bus tuning parameters"); static int pci_enable_io_modes = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_io_modes, CTLFLAG_RWTUN, &pci_enable_io_modes, 1, "Enable I/O and memory bits in the config register. Some BIOSes do not" " enable these bits correctly. We'd like to do this all the time, but" " there are some peripherals that this causes problems with."); static int pci_do_realloc_bars = 1; SYSCTL_INT(_hw_pci, OID_AUTO, realloc_bars, CTLFLAG_RWTUN, &pci_do_realloc_bars, 0, "Attempt to allocate a new range for any BARs whose original " "firmware-assigned ranges fail to allocate during the initial device scan."); static int pci_do_power_nodriver = 0; SYSCTL_INT(_hw_pci, OID_AUTO, do_power_nodriver, CTLFLAG_RWTUN, &pci_do_power_nodriver, 0, "Place a function into D3 state when no driver attaches to it. 0 means" " disable. 1 means conservatively place devices into D3 state. 2 means" " aggressively place devices into D3 state. 3 means put absolutely" " everything in D3 state."); int pci_do_power_resume = 1; SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RWTUN, &pci_do_power_resume, 1, "Transition from D3 -> D0 on resume."); int pci_do_power_suspend = 1; SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RWTUN, &pci_do_power_suspend, 1, "Transition from D0 -> D3 on suspend."); static int pci_do_msi = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RWTUN, &pci_do_msi, 1, "Enable support for MSI interrupts"); static int pci_do_msix = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_msix, CTLFLAG_RWTUN, &pci_do_msix, 1, "Enable support for MSI-X interrupts"); static int pci_msix_rewrite_table = 0; SYSCTL_INT(_hw_pci, OID_AUTO, msix_rewrite_table, CTLFLAG_RWTUN, &pci_msix_rewrite_table, 0, "Rewrite entire MSI-X table when updating MSI-X entries"); static int pci_honor_msi_blacklist = 1; SYSCTL_INT(_hw_pci, OID_AUTO, honor_msi_blacklist, CTLFLAG_RDTUN, &pci_honor_msi_blacklist, 1, "Honor chipset blacklist for MSI/MSI-X"); #if defined(__i386__) || defined(__amd64__) static int pci_usb_takeover = 1; #else static int pci_usb_takeover = 0; #endif SYSCTL_INT(_hw_pci, OID_AUTO, usb_early_takeover, CTLFLAG_RDTUN, &pci_usb_takeover, 1, "Enable early takeover of USB controllers. 
Disable this if you depend on" " BIOS emulation of USB devices, that is you use USB devices (like" " keyboard or mouse) but do not load USB drivers"); static int pci_clear_bars; SYSCTL_INT(_hw_pci, OID_AUTO, clear_bars, CTLFLAG_RDTUN, &pci_clear_bars, 0, "Ignore firmware-assigned resources for BARs."); #if defined(NEW_PCIB) && defined(PCI_RES_BUS) static int pci_clear_buses; SYSCTL_INT(_hw_pci, OID_AUTO, clear_buses, CTLFLAG_RDTUN, &pci_clear_buses, 0, "Ignore firmware-assigned bus numbers."); #endif static int pci_enable_ari = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_ari, CTLFLAG_RDTUN, &pci_enable_ari, 0, "Enable support for PCIe Alternative RID Interpretation"); int pci_enable_aspm = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_aspm, CTLFLAG_RDTUN, &pci_enable_aspm, 0, "Enable support for PCIe Active State Power Management"); static int pci_clear_aer_on_attach = 0; SYSCTL_INT(_hw_pci, OID_AUTO, clear_aer_on_attach, CTLFLAG_RWTUN, &pci_clear_aer_on_attach, 0, "Clear port and device AER state on driver attach"); static int pci_has_quirk(uint32_t devid, int quirk) { const struct pci_quirk *q; for (q = &pci_quirks[0]; q->devid; q++) { if (q->devid == devid && q->type == quirk) return (1); } return (0); } /* Find a device_t by bus/slot/function in domain 0 */ device_t pci_find_bsf(uint8_t bus, uint8_t slot, uint8_t func) { return (pci_find_dbsf(0, bus, slot, func)); } /* Find a device_t by domain/bus/slot/function */ device_t pci_find_dbsf(uint32_t domain, uint8_t bus, uint8_t slot, uint8_t func) { struct pci_devinfo *dinfo = NULL; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { if ((dinfo->cfg.domain == domain) && (dinfo->cfg.bus == bus) && (dinfo->cfg.slot == slot) && (dinfo->cfg.func == func)) { break; } } return (dinfo != NULL ? dinfo->cfg.dev : NULL); } /* Find a device_t by vendor/device ID */ device_t pci_find_device(uint16_t vendor, uint16_t device) { struct pci_devinfo *dinfo; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { if ((dinfo->cfg.vendor == vendor) && (dinfo->cfg.device == device)) { return (dinfo->cfg.dev); } } return (NULL); } device_t pci_find_class(uint8_t class, uint8_t subclass) { struct pci_devinfo *dinfo; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { if (dinfo->cfg.baseclass == class && dinfo->cfg.subclass == subclass) { return (dinfo->cfg.dev); } } return (NULL); } device_t pci_find_class_from(uint8_t class, uint8_t subclass, device_t from) { struct pci_devinfo *dinfo; bool found = false; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { if (from != NULL && found == false) { if (from != dinfo->cfg.dev) continue; found = true; continue; } if (dinfo->cfg.baseclass == class && dinfo->cfg.subclass == subclass) { return (dinfo->cfg.dev); } } return (NULL); } static int pci_printf(pcicfgregs *cfg, const char *fmt, ...) 
{ va_list ap; int retval; retval = printf("pci%d:%d:%d:%d: ", cfg->domain, cfg->bus, cfg->slot, cfg->func); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /* return base address of memory or port map */ static pci_addr_t pci_mapbase(uint64_t mapreg) { if (PCI_BAR_MEM(mapreg)) return (mapreg & PCIM_BAR_MEM_BASE); else return (mapreg & PCIM_BAR_IO_BASE); } /* return map type of memory or port map */ static const char * pci_maptype(uint64_t mapreg) { if (PCI_BAR_IO(mapreg)) return ("I/O Port"); if (mapreg & PCIM_BAR_MEM_PREFETCH) return ("Prefetchable Memory"); return ("Memory"); } /* return log2 of map size decoded for memory or port map */ int pci_mapsize(uint64_t testval) { int ln2size; testval = pci_mapbase(testval); ln2size = 0; if (testval != 0) { while ((testval & 1) == 0) { ln2size++; testval >>= 1; } } return (ln2size); } /* return base address of device ROM */ static pci_addr_t pci_rombase(uint64_t mapreg) { return (mapreg & PCIM_BIOS_ADDR_MASK); } /* return log2 of map size decided for device ROM */ static int pci_romsize(uint64_t testval) { int ln2size; testval = pci_rombase(testval); ln2size = 0; if (testval != 0) { while ((testval & 1) == 0) { ln2size++; testval >>= 1; } } return (ln2size); } /* return log2 of address range supported by map register */ static int pci_maprange(uint64_t mapreg) { int ln2range = 0; if (PCI_BAR_IO(mapreg)) ln2range = 32; else switch (mapreg & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_32: ln2range = 32; break; case PCIM_BAR_MEM_1MB: ln2range = 20; break; case PCIM_BAR_MEM_64: ln2range = 64; break; } return (ln2range); } /* adjust some values from PCI 1.0 devices to match 2.0 standards ... */ static void pci_fixancient(pcicfgregs *cfg) { if ((cfg->hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) return; /* PCI to PCI bridges use header type 1 */ if (cfg->baseclass == PCIC_BRIDGE && cfg->subclass == PCIS_BRIDGE_PCI) cfg->hdrtype = PCIM_HDRTYPE_BRIDGE; } /* extract header type specific config data */ static void pci_hdrtypedata(device_t pcib, int b, int s, int f, pcicfgregs *cfg) { #define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w) switch (cfg->hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_NORMAL: cfg->subvendor = REG(PCIR_SUBVEND_0, 2); cfg->subdevice = REG(PCIR_SUBDEV_0, 2); cfg->mingnt = REG(PCIR_MINGNT, 1); cfg->maxlat = REG(PCIR_MAXLAT, 1); cfg->nummaps = PCI_MAXMAPS_0; break; case PCIM_HDRTYPE_BRIDGE: cfg->bridge.br_seclat = REG(PCIR_SECLAT_1, 1); cfg->bridge.br_subbus = REG(PCIR_SUBBUS_1, 1); cfg->bridge.br_secbus = REG(PCIR_SECBUS_1, 1); cfg->bridge.br_pribus = REG(PCIR_PRIBUS_1, 1); cfg->bridge.br_control = REG(PCIR_BRIDGECTL_1, 2); cfg->nummaps = PCI_MAXMAPS_1; break; case PCIM_HDRTYPE_CARDBUS: cfg->bridge.br_seclat = REG(PCIR_SECLAT_2, 1); cfg->bridge.br_subbus = REG(PCIR_SUBBUS_2, 1); cfg->bridge.br_secbus = REG(PCIR_SECBUS_2, 1); cfg->bridge.br_pribus = REG(PCIR_PRIBUS_2, 1); cfg->bridge.br_control = REG(PCIR_BRIDGECTL_2, 2); cfg->subvendor = REG(PCIR_SUBVEND_2, 2); cfg->subdevice = REG(PCIR_SUBDEV_2, 2); cfg->nummaps = PCI_MAXMAPS_2; break; } #undef REG } /* read configuration header into pcicfgregs structure */ struct pci_devinfo * pci_read_device(device_t pcib, device_t bus, int d, int b, int s, int f) { #define REG(n, w) PCIB_READ_CONFIG(pcib, b, s, f, n, w) uint16_t vid, did; vid = REG(PCIR_VENDOR, 2); if (vid == PCIV_INVALID) return (NULL); did = REG(PCIR_DEVICE, 2); return (pci_fill_devinfo(pcib, bus, d, b, s, f, vid, did)); } struct pci_devinfo * pci_alloc_devinfo_method(device_t dev) { return 
(malloc(sizeof(struct pci_devinfo), M_DEVBUF, M_WAITOK | M_ZERO)); } static struct pci_devinfo * pci_fill_devinfo(device_t pcib, device_t bus, int d, int b, int s, int f, uint16_t vid, uint16_t did) { struct pci_devinfo *devlist_entry; pcicfgregs *cfg; devlist_entry = PCI_ALLOC_DEVINFO(bus); cfg = &devlist_entry->cfg; cfg->domain = d; cfg->bus = b; cfg->slot = s; cfg->func = f; cfg->vendor = vid; cfg->device = did; cfg->cmdreg = REG(PCIR_COMMAND, 2); cfg->statreg = REG(PCIR_STATUS, 2); cfg->baseclass = REG(PCIR_CLASS, 1); cfg->subclass = REG(PCIR_SUBCLASS, 1); cfg->progif = REG(PCIR_PROGIF, 1); cfg->revid = REG(PCIR_REVID, 1); cfg->hdrtype = REG(PCIR_HDRTYPE, 1); cfg->cachelnsz = REG(PCIR_CACHELNSZ, 1); cfg->lattimer = REG(PCIR_LATTIMER, 1); cfg->intpin = REG(PCIR_INTPIN, 1); cfg->intline = REG(PCIR_INTLINE, 1); cfg->mfdev = (cfg->hdrtype & PCIM_MFDEV) != 0; cfg->hdrtype &= ~PCIM_MFDEV; STAILQ_INIT(&cfg->maps); cfg->iov = NULL; pci_fixancient(cfg); pci_hdrtypedata(pcib, b, s, f, cfg); if (REG(PCIR_STATUS, 2) & PCIM_STATUS_CAPPRESENT) pci_read_cap(pcib, cfg); STAILQ_INSERT_TAIL(&pci_devq, devlist_entry, pci_links); devlist_entry->conf.pc_sel.pc_domain = cfg->domain; devlist_entry->conf.pc_sel.pc_bus = cfg->bus; devlist_entry->conf.pc_sel.pc_dev = cfg->slot; devlist_entry->conf.pc_sel.pc_func = cfg->func; devlist_entry->conf.pc_hdr = cfg->hdrtype; devlist_entry->conf.pc_subvendor = cfg->subvendor; devlist_entry->conf.pc_subdevice = cfg->subdevice; devlist_entry->conf.pc_vendor = cfg->vendor; devlist_entry->conf.pc_device = cfg->device; devlist_entry->conf.pc_class = cfg->baseclass; devlist_entry->conf.pc_subclass = cfg->subclass; devlist_entry->conf.pc_progif = cfg->progif; devlist_entry->conf.pc_revid = cfg->revid; pci_numdevs++; pci_generation++; return (devlist_entry); } #undef REG static void pci_ea_fill_info(device_t pcib, pcicfgregs *cfg) { #define REG(n, w) PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, \ cfg->ea.ea_location + (n), w) int num_ent; int ptr; int a, b; uint32_t val; int ent_size; uint32_t dw[4]; uint64_t base, max_offset; struct pci_ea_entry *eae; if (cfg->ea.ea_location == 0) return; STAILQ_INIT(&cfg->ea.ea_entries); /* Determine the number of entries */ num_ent = REG(PCIR_EA_NUM_ENT, 2); num_ent &= PCIM_EA_NUM_ENT_MASK; /* Find the first entry to care of */ ptr = PCIR_EA_FIRST_ENT; /* Skip DWORD 2 for type 1 functions */ if ((cfg->hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_BRIDGE) ptr += 4; for (a = 0; a < num_ent; a++) { eae = malloc(sizeof(*eae), M_DEVBUF, M_WAITOK | M_ZERO); eae->eae_cfg_offset = cfg->ea.ea_location + ptr; /* Read a number of dwords in the entry */ val = REG(ptr, 4); ptr += 4; ent_size = (val & PCIM_EA_ES); for (b = 0; b < ent_size; b++) { dw[b] = REG(ptr, 4); ptr += 4; } eae->eae_flags = val; eae->eae_bei = (PCIM_EA_BEI & val) >> PCIM_EA_BEI_OFFSET; base = dw[0] & PCIM_EA_FIELD_MASK; max_offset = dw[1] | ~PCIM_EA_FIELD_MASK; b = 2; if (((dw[0] & PCIM_EA_IS_64) != 0) && (b < ent_size)) { base |= (uint64_t)dw[b] << 32UL; b++; } if (((dw[1] & PCIM_EA_IS_64) != 0) && (b < ent_size)) { max_offset |= (uint64_t)dw[b] << 32UL; b++; } eae->eae_base = base; eae->eae_max_offset = max_offset; STAILQ_INSERT_TAIL(&cfg->ea.ea_entries, eae, eae_link); if (bootverbose) { printf("PCI(EA) dev %04x:%04x, bei %d, flags #%x, base #%jx, max_offset #%jx\n", cfg->vendor, cfg->device, eae->eae_bei, eae->eae_flags, (uintmax_t)eae->eae_base, (uintmax_t)eae->eae_max_offset); } } } #undef REG static void pci_read_cap(device_t pcib, pcicfgregs *cfg) { #define REG(n, w) 
PCIB_READ_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, w) #define WREG(n, v, w) PCIB_WRITE_CONFIG(pcib, cfg->bus, cfg->slot, cfg->func, n, v, w) #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) uint64_t addr; #endif uint32_t val; int ptr, nextptr, ptrptr; switch (cfg->hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_NORMAL: case PCIM_HDRTYPE_BRIDGE: ptrptr = PCIR_CAP_PTR; break; case PCIM_HDRTYPE_CARDBUS: ptrptr = PCIR_CAP_PTR_2; /* cardbus capabilities ptr */ break; default: return; /* no extended capabilities support */ } nextptr = REG(ptrptr, 1); /* sanity check? */ /* * Read capability entries. */ while (nextptr != 0) { /* Sanity check */ if (nextptr > 255) { printf("illegal PCI extended capability offset %d\n", nextptr); return; } /* Find the next entry */ ptr = nextptr; nextptr = REG(ptr + PCICAP_NEXTPTR, 1); /* Process this entry */ switch (REG(ptr + PCICAP_ID, 1)) { case PCIY_PMG: /* PCI power management */ if (cfg->pp.pp_cap == 0) { cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2); cfg->pp.pp_status = ptr + PCIR_POWER_STATUS; cfg->pp.pp_bse = ptr + PCIR_POWER_BSE; if ((nextptr - ptr) > PCIR_POWER_DATA) cfg->pp.pp_data = ptr + PCIR_POWER_DATA; } break; case PCIY_HT: /* HyperTransport */ /* Determine HT-specific capability type. */ val = REG(ptr + PCIR_HT_COMMAND, 2); if ((val & 0xe000) == PCIM_HTCAP_SLAVE) cfg->ht.ht_slave = ptr; #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) switch (val & PCIM_HTCMD_CAP_MASK) { case PCIM_HTCAP_MSI_MAPPING: if (!(val & PCIM_HTCMD_MSI_FIXED)) { /* Sanity check the mapping window. */ addr = REG(ptr + PCIR_HTMSI_ADDRESS_HI, 4); addr <<= 32; addr |= REG(ptr + PCIR_HTMSI_ADDRESS_LO, 4); if (addr != MSI_INTEL_ADDR_BASE) device_printf(pcib, "HT device at pci%d:%d:%d:%d has non-default MSI window 0x%llx\n", cfg->domain, cfg->bus, cfg->slot, cfg->func, (long long)addr); } else addr = MSI_INTEL_ADDR_BASE; cfg->ht.ht_msimap = ptr; cfg->ht.ht_msictrl = val; cfg->ht.ht_msiaddr = addr; break; } #endif break; case PCIY_MSI: /* PCI MSI */ cfg->msi.msi_location = ptr; cfg->msi.msi_ctrl = REG(ptr + PCIR_MSI_CTRL, 2); cfg->msi.msi_msgnum = 1 << ((cfg->msi.msi_ctrl & PCIM_MSICTRL_MMC_MASK)>>1); break; case PCIY_MSIX: /* PCI MSI-X */ cfg->msix.msix_location = ptr; cfg->msix.msix_ctrl = REG(ptr + PCIR_MSIX_CTRL, 2); cfg->msix.msix_msgnum = (cfg->msix.msix_ctrl & PCIM_MSIXCTRL_TABLE_SIZE) + 1; val = REG(ptr + PCIR_MSIX_TABLE, 4); cfg->msix.msix_table_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK); cfg->msix.msix_table_offset = val & ~PCIM_MSIX_BIR_MASK; val = REG(ptr + PCIR_MSIX_PBA, 4); cfg->msix.msix_pba_bar = PCIR_BAR(val & PCIM_MSIX_BIR_MASK); cfg->msix.msix_pba_offset = val & ~PCIM_MSIX_BIR_MASK; break; case PCIY_VPD: /* PCI Vital Product Data */ cfg->vpd.vpd_reg = ptr; break; case PCIY_SUBVENDOR: /* Should always be true. */ if ((cfg->hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_BRIDGE) { val = REG(ptr + PCIR_SUBVENDCAP_ID, 4); cfg->subvendor = val & 0xffff; cfg->subdevice = val >> 16; } break; case PCIY_PCIX: /* PCI-X */ /* * Assume we have a PCI-X chipset if we have * at least one PCI-PCI bridge with a PCI-X * capability. Note that some systems with * PCI-express or HT chipsets might match on * this check as well. */ if ((cfg->hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_BRIDGE) pcix_chipset = 1; cfg->pcix.pcix_location = ptr; break; case PCIY_EXPRESS: /* PCI-express */ /* * Assume we have a PCI-express chipset if we have * at least one PCI-express device. 
*/ pcie_chipset = 1; cfg->pcie.pcie_location = ptr; val = REG(ptr + PCIER_FLAGS, 2); cfg->pcie.pcie_type = val & PCIEM_FLAGS_TYPE; break; case PCIY_EA: /* Enhanced Allocation */ cfg->ea.ea_location = ptr; pci_ea_fill_info(pcib, cfg); break; default: break; } } #if defined(__powerpc__) /* * Enable the MSI mapping window for all HyperTransport * slaves. PCI-PCI bridges have their windows enabled via * PCIB_MAP_MSI(). */ if (cfg->ht.ht_slave != 0 && cfg->ht.ht_msimap != 0 && !(cfg->ht.ht_msictrl & PCIM_HTCMD_MSI_ENABLE)) { device_printf(pcib, "Enabling MSI window for HyperTransport slave at pci%d:%d:%d:%d\n", cfg->domain, cfg->bus, cfg->slot, cfg->func); cfg->ht.ht_msictrl |= PCIM_HTCMD_MSI_ENABLE; WREG(cfg->ht.ht_msimap + PCIR_HT_COMMAND, cfg->ht.ht_msictrl, 2); } #endif /* REG and WREG use carry through to next functions */ } /* * PCI Vital Product Data */ #define PCI_VPD_TIMEOUT 1000000 static int pci_read_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t *data) { int count = PCI_VPD_TIMEOUT; KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned")); WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg, 2); while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) != 0x8000) { if (--count < 0) return (ENXIO); DELAY(1); /* limit looping */ } *data = (REG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, 4)); return (0); } #if 0 static int pci_write_vpd_reg(device_t pcib, pcicfgregs *cfg, int reg, uint32_t data) { int count = PCI_VPD_TIMEOUT; KASSERT((reg & 3) == 0, ("VPD register must by 4 byte aligned")); WREG(cfg->vpd.vpd_reg + PCIR_VPD_DATA, data, 4); WREG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, reg | 0x8000, 2); while ((REG(cfg->vpd.vpd_reg + PCIR_VPD_ADDR, 2) & 0x8000) == 0x8000) { if (--count < 0) return (ENXIO); DELAY(1); /* limit looping */ } return (0); } #endif #undef PCI_VPD_TIMEOUT struct vpd_readstate { device_t pcib; pcicfgregs *cfg; uint32_t val; int bytesinval; int off; uint8_t cksum; }; static int vpd_nextbyte(struct vpd_readstate *vrs, uint8_t *data) { uint32_t reg; uint8_t byte; if (vrs->bytesinval == 0) { if (pci_read_vpd_reg(vrs->pcib, vrs->cfg, vrs->off, ®)) return (ENXIO); vrs->val = le32toh(reg); vrs->off += 4; byte = vrs->val & 0xff; vrs->bytesinval = 3; } else { vrs->val = vrs->val >> 8; byte = vrs->val & 0xff; vrs->bytesinval--; } vrs->cksum += byte; *data = byte; return (0); } static void pci_read_vpd(device_t pcib, pcicfgregs *cfg) { struct vpd_readstate vrs; int state; int name; int remain; int i; int alloc, off; /* alloc/off for RO/W arrays */ int cksumvalid; int dflen; int firstrecord; uint8_t byte; uint8_t byte2; /* init vpd reader */ vrs.bytesinval = 0; vrs.off = 0; vrs.pcib = pcib; vrs.cfg = cfg; vrs.cksum = 0; state = 0; name = remain = i = 0; /* shut up stupid gcc */ alloc = off = 0; /* shut up stupid gcc */ dflen = 0; /* shut up stupid gcc */ cksumvalid = -1; firstrecord = 1; while (state >= 0) { if (vpd_nextbyte(&vrs, &byte)) { pci_printf(cfg, "VPD read timed out\n"); state = -2; break; } #if 0 pci_printf(cfg, "vpd: val: %#x, off: %d, bytesinval: %d, byte: " "%#hhx, state: %d, remain: %d, name: %#x, i: %d\n", vrs.val, vrs.off, vrs.bytesinval, byte, state, remain, name, i); #endif switch (state) { case 0: /* item name */ if (byte & 0x80) { if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } remain = byte2; if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } remain |= byte2 << 8; name = byte & 0x7f; } else { remain = byte & 0x7; name = (byte >> 3) & 0xf; } if (firstrecord) { if (name != 0x2) { pci_printf(cfg, "VPD data does not " \ "start with ident (%#x)\n", 
name); state = -2; break; } firstrecord = 0; } if (vrs.off + remain - vrs.bytesinval > 0x8000) { pci_printf(cfg, "VPD data overflow, remain %#x\n", remain); state = -1; break; } switch (name) { case 0x2: /* String */ if (cfg->vpd.vpd_ident != NULL) { pci_printf(cfg, "duplicate VPD ident record\n"); state = -2; break; } if (remain > 255) { pci_printf(cfg, "VPD ident length %d exceeds 255\n", remain); state = -2; break; } cfg->vpd.vpd_ident = malloc(remain + 1, M_DEVBUF, M_WAITOK); i = 0; state = 1; break; case 0xf: /* End */ state = -1; break; case 0x10: /* VPD-R */ alloc = 8; off = 0; cfg->vpd.vpd_ros = malloc(alloc * sizeof(*cfg->vpd.vpd_ros), M_DEVBUF, M_WAITOK | M_ZERO); state = 2; break; case 0x11: /* VPD-W */ alloc = 8; off = 0; cfg->vpd.vpd_w = malloc(alloc * sizeof(*cfg->vpd.vpd_w), M_DEVBUF, M_WAITOK | M_ZERO); state = 5; break; default: /* Invalid data, abort */ pci_printf(cfg, "invalid VPD name: %#x\n", name); state = -2; break; } break; case 1: /* Identifier String */ cfg->vpd.vpd_ident[i++] = byte; remain--; if (remain == 0) { cfg->vpd.vpd_ident[i] = '\0'; state = 0; } break; case 2: /* VPD-R Keyword Header */ if (off == alloc) { cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros, (alloc *= 2) * sizeof(*cfg->vpd.vpd_ros), M_DEVBUF, M_WAITOK | M_ZERO); } cfg->vpd.vpd_ros[off].keyword[0] = byte; if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } cfg->vpd.vpd_ros[off].keyword[1] = byte2; if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } cfg->vpd.vpd_ros[off].len = dflen = byte2; if (dflen == 0 && strncmp(cfg->vpd.vpd_ros[off].keyword, "RV", 2) == 0) { /* * if this happens, we can't trust the rest * of the VPD. */ pci_printf(cfg, "invalid VPD RV record"); cksumvalid = 0; state = -1; break; } else if (dflen == 0) { cfg->vpd.vpd_ros[off].value = malloc(1 * sizeof(*cfg->vpd.vpd_ros[off].value), M_DEVBUF, M_WAITOK); cfg->vpd.vpd_ros[off].value[0] = '\x00'; } else cfg->vpd.vpd_ros[off].value = malloc( (dflen + 1) * sizeof(*cfg->vpd.vpd_ros[off].value), M_DEVBUF, M_WAITOK); remain -= 3; i = 0; /* keep in sync w/ state 3's transitions */ if (dflen == 0 && remain == 0) state = 0; else if (dflen == 0) state = 2; else state = 3; break; case 3: /* VPD-R Keyword Value */ cfg->vpd.vpd_ros[off].value[i++] = byte; if (strncmp(cfg->vpd.vpd_ros[off].keyword, "RV", 2) == 0 && cksumvalid == -1) { if (vrs.cksum == 0) cksumvalid = 1; else { if (bootverbose) pci_printf(cfg, "bad VPD cksum, remain %hhu\n", vrs.cksum); cksumvalid = 0; state = -1; break; } } dflen--; remain--; /* keep in sync w/ state 2's transitions */ if (dflen == 0) cfg->vpd.vpd_ros[off++].value[i++] = '\0'; if (dflen == 0 && remain == 0) { cfg->vpd.vpd_rocnt = off; cfg->vpd.vpd_ros = reallocf(cfg->vpd.vpd_ros, off * sizeof(*cfg->vpd.vpd_ros), M_DEVBUF, M_WAITOK | M_ZERO); state = 0; } else if (dflen == 0) state = 2; break; case 4: remain--; if (remain == 0) state = 0; break; case 5: /* VPD-W Keyword Header */ if (off == alloc) { cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w, (alloc *= 2) * sizeof(*cfg->vpd.vpd_w), M_DEVBUF, M_WAITOK | M_ZERO); } cfg->vpd.vpd_w[off].keyword[0] = byte; if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } cfg->vpd.vpd_w[off].keyword[1] = byte2; if (vpd_nextbyte(&vrs, &byte2)) { state = -2; break; } cfg->vpd.vpd_w[off].len = dflen = byte2; cfg->vpd.vpd_w[off].start = vrs.off - vrs.bytesinval; cfg->vpd.vpd_w[off].value = malloc((dflen + 1) * sizeof(*cfg->vpd.vpd_w[off].value), M_DEVBUF, M_WAITOK); remain -= 3; i = 0; /* keep in sync w/ state 6's transitions */ if (dflen == 0 && remain == 0) state = 0; else 
if (dflen == 0) state = 5; else state = 6; break; case 6: /* VPD-W Keyword Value */ cfg->vpd.vpd_w[off].value[i++] = byte; dflen--; remain--; /* keep in sync w/ state 5's transitions */ if (dflen == 0) cfg->vpd.vpd_w[off++].value[i++] = '\0'; if (dflen == 0 && remain == 0) { cfg->vpd.vpd_wcnt = off; cfg->vpd.vpd_w = reallocf(cfg->vpd.vpd_w, off * sizeof(*cfg->vpd.vpd_w), M_DEVBUF, M_WAITOK | M_ZERO); state = 0; } else if (dflen == 0) state = 5; break; default: pci_printf(cfg, "invalid state: %d\n", state); state = -1; break; } if (cfg->vpd.vpd_ident == NULL || cfg->vpd.vpd_ident[0] == '\0') { pci_printf(cfg, "no valid vpd ident found\n"); state = -2; } } if (cksumvalid <= 0 || state < -1) { /* read-only data bad, clean up */ if (cfg->vpd.vpd_ros != NULL) { for (off = 0; cfg->vpd.vpd_ros[off].value; off++) free(cfg->vpd.vpd_ros[off].value, M_DEVBUF); free(cfg->vpd.vpd_ros, M_DEVBUF); cfg->vpd.vpd_ros = NULL; } } if (state < -1) { /* I/O error, clean up */ pci_printf(cfg, "failed to read VPD data.\n"); if (cfg->vpd.vpd_ident != NULL) { free(cfg->vpd.vpd_ident, M_DEVBUF); cfg->vpd.vpd_ident = NULL; } if (cfg->vpd.vpd_w != NULL) { for (off = 0; cfg->vpd.vpd_w[off].value; off++) free(cfg->vpd.vpd_w[off].value, M_DEVBUF); free(cfg->vpd.vpd_w, M_DEVBUF); cfg->vpd.vpd_w = NULL; } } cfg->vpd.vpd_cached = 1; #undef REG #undef WREG } int pci_get_vpd_ident_method(device_t dev, device_t child, const char **identptr) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0) pci_read_vpd(device_get_parent(dev), cfg); *identptr = cfg->vpd.vpd_ident; if (*identptr == NULL) return (ENXIO); return (0); } int pci_get_vpd_readonly_method(device_t dev, device_t child, const char *kw, const char **vptr) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; int i; if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0) pci_read_vpd(device_get_parent(dev), cfg); for (i = 0; i < cfg->vpd.vpd_rocnt; i++) if (memcmp(kw, cfg->vpd.vpd_ros[i].keyword, sizeof(cfg->vpd.vpd_ros[i].keyword)) == 0) { *vptr = cfg->vpd.vpd_ros[i].value; return (0); } *vptr = NULL; return (ENXIO); } struct pcicfg_vpd * pci_fetch_vpd_list(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); pcicfgregs *cfg = &dinfo->cfg; if (!cfg->vpd.vpd_cached && cfg->vpd.vpd_reg != 0) pci_read_vpd(device_get_parent(device_get_parent(dev)), cfg); return (&cfg->vpd); } /* * Find the requested HyperTransport capability and return the offset * in configuration space via the pointer provided. The function * returns 0 on success and an error code otherwise. */ int pci_find_htcap_method(device_t dev, device_t child, int capability, int *capreg) { int ptr, error; uint16_t val; error = pci_find_cap(child, PCIY_HT, &ptr); if (error) return (error); /* * Traverse the capabilities list checking each HT capability * to see if it matches the requested HT capability. */ for (;;) { val = pci_read_config(child, ptr + PCIR_HT_COMMAND, 2); if (capability == PCIM_HTCAP_SLAVE || capability == PCIM_HTCAP_HOST) val &= 0xe000; else val &= PCIM_HTCMD_CAP_MASK; if (val == capability) { if (capreg != NULL) *capreg = ptr; return (0); } /* Skip to the next HT capability. */ if (pci_find_next_cap(child, PCIY_HT, ptr, &ptr) != 0) break; } return (ENOENT); } /* * Find the next requested HyperTransport capability after start and return * the offset in configuration space via the pointer provided. The function * returns 0 on success and an error code otherwise. 
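 *
 * Both HT search routines build on the generic capability walk: in legacy
 * configuration space the capabilities form a singly linked list rooted at
 * the capabilities pointer register (offset 0x34 for type 0/1 headers),
 * each node carrying an ID byte at offset 0 and a "next" byte at offset 1.
 * The following standalone sketch of that walk over a captured 256-byte
 * snapshot is illustrative only; the snapshot array and demo_* names are
 * not kernel API.
 */

#if 0	/* standalone illustrative sketch, not compiled as part of pci(4) */
#include <stdint.h>
#include <stdio.h>

#define	DEMO_CAP_PTR	0x34	/* capabilities pointer, type 0/1 headers */

/* Return the config-space offset of capability 'id', or 0 if absent. */
static uint8_t
demo_find_cap(const uint8_t cfg[256], uint8_t id)
{
	uint8_t ptr;

	ptr = cfg[DEMO_CAP_PTR];
	while (ptr != 0) {
		if (cfg[ptr] == id)	/* ID byte at offset 0 */
			return (ptr);
		ptr = cfg[ptr + 1];	/* next pointer at offset 1 */
	}
	return (0);
}

int
main(void)
{
	uint8_t cfg[256] = { 0 };

	/* Fake a two-entry list: MSI (0x05) at 0x50 -> PM (0x01) at 0x60. */
	cfg[DEMO_CAP_PTR] = 0x50;
	cfg[0x50] = 0x05;
	cfg[0x51] = 0x60;
	cfg[0x60] = 0x01;
	cfg[0x61] = 0x00;

	printf("PM capability at %#x\n", demo_find_cap(cfg, 0x01)); /* 0x60 */
	return (0);
}
#endif

/*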
*/ int pci_find_next_htcap_method(device_t dev, device_t child, int capability, int start, int *capreg) { int ptr; uint16_t val; KASSERT(pci_read_config(child, start + PCICAP_ID, 1) == PCIY_HT, ("start capability is not HyperTransport capability")); ptr = start; /* * Traverse the capabilities list checking each HT capability * to see if it matches the requested HT capability. */ for (;;) { /* Skip to the next HT capability. */ if (pci_find_next_cap(child, PCIY_HT, ptr, &ptr) != 0) break; val = pci_read_config(child, ptr + PCIR_HT_COMMAND, 2); if (capability == PCIM_HTCAP_SLAVE || capability == PCIM_HTCAP_HOST) val &= 0xe000; else val &= PCIM_HTCMD_CAP_MASK; if (val == capability) { if (capreg != NULL) *capreg = ptr; return (0); } } return (ENOENT); } /* * Find the requested capability and return the offset in * configuration space via the pointer provided. The function returns * 0 on success and an error code otherwise. */ int pci_find_cap_method(device_t dev, device_t child, int capability, int *capreg) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; uint32_t status; uint8_t ptr; /* * Check the CAP_LIST bit of the PCI status register first. */ status = pci_read_config(child, PCIR_STATUS, 2); if (!(status & PCIM_STATUS_CAPPRESENT)) return (ENXIO); /* * Determine the start pointer of the capabilities list. */ switch (cfg->hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_NORMAL: case PCIM_HDRTYPE_BRIDGE: ptr = PCIR_CAP_PTR; break; case PCIM_HDRTYPE_CARDBUS: ptr = PCIR_CAP_PTR_2; break; default: /* XXX: panic? */ return (ENXIO); /* no extended capabilities support */ } ptr = pci_read_config(child, ptr, 1); /* * Traverse the capabilities list. */ while (ptr != 0) { if (pci_read_config(child, ptr + PCICAP_ID, 1) == capability) { if (capreg != NULL) *capreg = ptr; return (0); } ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1); } return (ENOENT); } /* * Find the next requested capability after start and return the offset in * configuration space via the pointer provided. The function returns * 0 on success and an error code otherwise. */ int pci_find_next_cap_method(device_t dev, device_t child, int capability, int start, int *capreg) { uint8_t ptr; KASSERT(pci_read_config(child, start + PCICAP_ID, 1) == capability, ("start capability is not expected capability")); ptr = pci_read_config(child, start + PCICAP_NEXTPTR, 1); while (ptr != 0) { if (pci_read_config(child, ptr + PCICAP_ID, 1) == capability) { if (capreg != NULL) *capreg = ptr; return (0); } ptr = pci_read_config(child, ptr + PCICAP_NEXTPTR, 1); } return (ENOENT); } /* * Find the requested extended capability and return the offset in * configuration space via the pointer provided. The function returns * 0 on success and an error code otherwise. */ int pci_find_extcap_method(device_t dev, device_t child, int capability, int *capreg) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; uint32_t ecap; uint16_t ptr; /* Only supported for PCI-express devices. */ if (cfg->pcie.pcie_location == 0) return (ENXIO); ptr = PCIR_EXTCAP; ecap = pci_read_config(child, ptr, 4); if (ecap == 0xffffffff || ecap == 0) return (ENOENT); for (;;) { if (PCI_EXTCAP_ID(ecap) == capability) { if (capreg != NULL) *capreg = ptr; return (0); } ptr = PCI_EXTCAP_NEXTPTR(ecap); if (ptr == 0) break; ecap = pci_read_config(child, ptr, 4); } return (ENOENT); } /* * Find the next requested extended capability after start and return the * offset in configuration space via the pointer provided. 
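 * Extended capabilities live in the 0x100-0xfff region of PCI-express
 * configuration space as a parallel linked list: each node is a 32-bit
 * header whose low 16 bits hold the capability ID, bits 16-19 the version,
 * and bits 20-31 the dword-aligned offset of the next header (0 ends the
 * list).  The following standalone sketch of that walk over a captured
 * 4 KB snapshot is illustrative only; the DEMO_* macros mirror, rather
 * than reuse, the PCI_EXTCAP_ID()/PCI_EXTCAP_NEXTPTR() accessors.
 */

#if 0	/* standalone illustrative sketch, not compiled as part of pci(4) */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	DEMO_EXTCAP_START	0x100
#define	DEMO_EXTCAP_ID(hdr)	((hdr) & 0xffff)
#define	DEMO_EXTCAP_NEXT(hdr)	(((hdr) >> 20) & 0xffc)

static uint32_t
demo_read32(const uint8_t *cfg, uint16_t off)
{
	uint32_t v;

	memcpy(&v, cfg + off, sizeof(v));	/* host-endian for the demo */
	return (v);
}

/* Return the offset of extended capability 'id', or 0 if absent. */
static uint16_t
demo_find_extcap(const uint8_t cfg[4096], uint16_t id)
{
	uint32_t hdr;
	uint16_t ptr;

	ptr = DEMO_EXTCAP_START;
	do {
		hdr = demo_read32(cfg, ptr);
		if (hdr == 0 || hdr == 0xffffffff)
			return (0);
		if (DEMO_EXTCAP_ID(hdr) == id)
			return (ptr);
		ptr = DEMO_EXTCAP_NEXT(hdr);
	} while (ptr != 0);
	return (0);
}

int
main(void)
{
	uint8_t cfg[4096] = { 0 };
	uint32_t aer = 0x0001 | (1 << 16) | (0x148 << 20);	/* AER, next 0x148 */
	uint32_t iov = 0x0010 | (1 << 16);			/* SR-IOV, end of list */

	memcpy(cfg + 0x100, &aer, sizeof(aer));
	memcpy(cfg + 0x148, &iov, sizeof(iov));
	printf("SR-IOV at %#x\n", demo_find_extcap(cfg, 0x0010)); /* 0x148 */
	return (0);
}
#endif

/*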
The function * returns 0 on success and an error code otherwise. */ int pci_find_next_extcap_method(device_t dev, device_t child, int capability, int start, int *capreg) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; uint32_t ecap; uint16_t ptr; /* Only supported for PCI-express devices. */ if (cfg->pcie.pcie_location == 0) return (ENXIO); ecap = pci_read_config(child, start, 4); KASSERT(PCI_EXTCAP_ID(ecap) == capability, ("start extended capability is not expected capability")); ptr = PCI_EXTCAP_NEXTPTR(ecap); while (ptr != 0) { ecap = pci_read_config(child, ptr, 4); if (PCI_EXTCAP_ID(ecap) == capability) { if (capreg != NULL) *capreg = ptr; return (0); } ptr = PCI_EXTCAP_NEXTPTR(ecap); } return (ENOENT); } /* * Support for MSI-X message interrupts. */ static void pci_write_msix_entry(device_t dev, u_int index, uint64_t address, uint32_t data) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msix *msix = &dinfo->cfg.msix; uint32_t offset; KASSERT(msix->msix_table_len > index, ("bogus index")); offset = msix->msix_table_offset + index * 16; bus_write_4(msix->msix_table_res, offset, address & 0xffffffff); bus_write_4(msix->msix_table_res, offset + 4, address >> 32); bus_write_4(msix->msix_table_res, offset + 8, data); } void pci_enable_msix_method(device_t dev, device_t child, u_int index, uint64_t address, uint32_t data) { if (pci_msix_rewrite_table) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; /* * Some VM hosts require MSIX to be disabled in the * control register before updating the MSIX table * entries are allowed. It is not enough to only * disable MSIX while updating a single entry. MSIX * must be disabled while updating all entries in the * table. */ pci_write_config(child, msix->msix_location + PCIR_MSIX_CTRL, msix->msix_ctrl & ~PCIM_MSIXCTRL_MSIX_ENABLE, 2); pci_resume_msix(child); } else pci_write_msix_entry(child, index, address, data); /* Enable MSI -> HT mapping. */ pci_ht_map_msi(child, address); } void pci_mask_msix(device_t dev, u_int index) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msix *msix = &dinfo->cfg.msix; uint32_t offset, val; KASSERT(msix->msix_msgnum > index, ("bogus index")); offset = msix->msix_table_offset + index * 16 + 12; val = bus_read_4(msix->msix_table_res, offset); val |= PCIM_MSIX_VCTRL_MASK; /* * Some devices (e.g. Samsung PM961) do not support reads of this * register, so always write the new value. */ bus_write_4(msix->msix_table_res, offset, val); } void pci_unmask_msix(device_t dev, u_int index) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msix *msix = &dinfo->cfg.msix; uint32_t offset, val; KASSERT(msix->msix_table_len > index, ("bogus index")); offset = msix->msix_table_offset + index * 16 + 12; val = bus_read_4(msix->msix_table_res, offset); val &= ~PCIM_MSIX_VCTRL_MASK; /* * Some devices (e.g. Samsung PM961) do not support reads of this * register, so always write the new value. */ bus_write_4(msix->msix_table_res, offset, val); } int pci_pending_msix(device_t dev, u_int index) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msix *msix = &dinfo->cfg.msix; uint32_t offset, bit; KASSERT(msix->msix_table_len > index, ("bogus index")); offset = msix->msix_pba_offset + (index / 32) * 4; bit = 1 << index % 32; return (bus_read_4(msix->msix_pba_res, offset) & bit); } /* * Restore MSI-X registers and table during resume. 
If MSI-X is * enabled then walk the virtual table to restore the actual MSI-X * table. */ static void pci_resume_msix(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msix *msix = &dinfo->cfg.msix; struct msix_table_entry *mte; struct msix_vector *mv; int i; if (msix->msix_alloc > 0) { /* First, mask all vectors. */ for (i = 0; i < msix->msix_msgnum; i++) pci_mask_msix(dev, i); /* Second, program any messages with at least one handler. */ for (i = 0; i < msix->msix_table_len; i++) { mte = &msix->msix_table[i]; if (mte->mte_vector == 0 || mte->mte_handlers == 0) continue; mv = &msix->msix_vectors[mte->mte_vector - 1]; pci_write_msix_entry(dev, i, mv->mv_address, mv->mv_data); pci_unmask_msix(dev, i); } } pci_write_config(dev, msix->msix_location + PCIR_MSIX_CTRL, msix->msix_ctrl, 2); } /* * Attempt to allocate *count MSI-X messages. The actual number allocated is * returned in *count. After this function returns, each message will be * available to the driver as SYS_RES_IRQ resources starting at rid 1. */ int pci_alloc_msix_method(device_t dev, device_t child, int *count) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; struct resource_list_entry *rle; int actual, error, i, irq, max; /* Don't let count == 0 get us into trouble. */ if (*count == 0) return (EINVAL); /* If rid 0 is allocated, then fail. */ rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0); if (rle != NULL && rle->res != NULL) return (ENXIO); /* Already have allocated messages? */ if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0) return (ENXIO); /* If MSI-X is blacklisted for this system, fail. */ if (pci_msix_blacklisted()) return (ENXIO); /* MSI-X capability present? */ if (cfg->msix.msix_location == 0 || !pci_do_msix) return (ENODEV); /* Make sure the appropriate BARs are mapped. */ rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY, cfg->msix.msix_table_bar); if (rle == NULL || rle->res == NULL || !(rman_get_flags(rle->res) & RF_ACTIVE)) return (ENXIO); cfg->msix.msix_table_res = rle->res; if (cfg->msix.msix_pba_bar != cfg->msix.msix_table_bar) { rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY, cfg->msix.msix_pba_bar); if (rle == NULL || rle->res == NULL || !(rman_get_flags(rle->res) & RF_ACTIVE)) return (ENXIO); } cfg->msix.msix_pba_res = rle->res; if (bootverbose) device_printf(child, "attempting to allocate %d MSI-X vectors (%d supported)\n", *count, cfg->msix.msix_msgnum); max = min(*count, cfg->msix.msix_msgnum); for (i = 0; i < max; i++) { /* Allocate a message. */ error = PCIB_ALLOC_MSIX(device_get_parent(dev), child, &irq); if (error) { if (i == 0) return (error); break; } resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq, irq, 1); } actual = i; if (bootverbose) { rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 1); if (actual == 1) device_printf(child, "using IRQ %ju for MSI-X\n", rle->start); else { int run; /* * Be fancy and try to print contiguous runs of * IRQ values as ranges. 'irq' is the previous IRQ. * 'run' is true if we are in a range. */ device_printf(child, "using IRQs %ju", rle->start); irq = rle->start; run = 0; for (i = 1; i < actual; i++) { rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); /* Still in a run? */ if (rle->start == irq + 1) { run = 1; irq++; continue; } /* Finish previous range. */ if (run) { printf("-%d", irq); run = 0; } /* Start new range. */ printf(",%ju", rle->start); irq = rle->start; } /* Unfinished range? 
*/ if (run) printf("-%d", irq); printf(" for MSI-X\n"); } } /* Mask all vectors. */ for (i = 0; i < cfg->msix.msix_msgnum; i++) pci_mask_msix(child, i); /* Allocate and initialize vector data and virtual table. */ cfg->msix.msix_vectors = malloc(sizeof(struct msix_vector) * actual, M_DEVBUF, M_WAITOK | M_ZERO); cfg->msix.msix_table = malloc(sizeof(struct msix_table_entry) * actual, M_DEVBUF, M_WAITOK | M_ZERO); for (i = 0; i < actual; i++) { rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); cfg->msix.msix_vectors[i].mv_irq = rle->start; cfg->msix.msix_table[i].mte_vector = i + 1; } /* Update control register to enable MSI-X. */ cfg->msix.msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(child, cfg->msix.msix_location + PCIR_MSIX_CTRL, cfg->msix.msix_ctrl, 2); /* Update counts of alloc'd messages. */ cfg->msix.msix_alloc = actual; cfg->msix.msix_table_len = actual; *count = actual; return (0); } /* * By default, pci_alloc_msix() will assign the allocated IRQ * resources consecutively to the first N messages in the MSI-X table. * However, device drivers may want to use different layouts if they * either receive fewer messages than they asked for, or they wish to * populate the MSI-X table sparsely. This method allows the driver * to specify what layout it wants. It must be called after a * successful pci_alloc_msix() but before any of the associated * SYS_RES_IRQ resources are allocated via bus_alloc_resource(). * * The 'vectors' array contains 'count' message vectors. The array * maps directly to the MSI-X table in that index 0 in the array * specifies the vector for the first message in the MSI-X table, etc. * The vector value in each array index can either be 0 to indicate * that no vector should be assigned to a message slot, or it can be a * number from 1 to N (where N is the count returned from a * succcessful call to pci_alloc_msix()) to indicate which message * vector (IRQ) to be used for the corresponding message. * * On successful return, each message with a non-zero vector will have * an associated SYS_RES_IRQ whose rid is equal to the array index + * 1. Additionally, if any of the IRQs allocated via the previous * call to pci_alloc_msix() are not used in the mapping, those IRQs * will be freed back to the system automatically. * * For example, suppose a driver has a MSI-X table with 6 messages and * asks for 6 messages, but pci_alloc_msix() only returns a count of * 3. Call the three vectors allocated by pci_alloc_msix() A, B, and * C. After the call to pci_alloc_msix(), the device will be setup to * have an MSI-X table of ABC--- (where - means no vector assigned). * If the driver then passes a vector array of { 1, 0, 1, 2, 0, 2 }, * then the MSI-X table will look like A-AB-B, and the 'C' vector will * be freed back to the system. This device will also have valid * SYS_RES_IRQ rids of 1, 3, 4, and 6. * * In any case, the SYS_RES_IRQ rid X will always map to the message * at MSI-X table index X - 1 and will only be valid if a vector is * assigned to that table entry. */ int pci_remap_msix_method(device_t dev, device_t child, int count, const u_int *vectors) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; struct resource_list_entry *rle; int i, irq, j, *used; /* * Have to have at least one message in the table but the * table can't be bigger than the actual MSI-X table in the * device. */ if (count == 0 || count > msix->msix_msgnum) return (EINVAL); /* Sanity check the vectors. 
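 * Each entry must either be zero (leave that message slot empty) or
 * refer to one of the 1..msix_alloc messages returned by the earlier
 * pci_alloc_msix() call; anything larger is rejected below.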
*/ for (i = 0; i < count; i++) if (vectors[i] > msix->msix_alloc) return (EINVAL); /* * Make sure there aren't any holes in the vectors to be used. * It's a big pain to support it, and it doesn't really make * sense anyway. Also, at least one vector must be used. */ used = malloc(sizeof(int) * msix->msix_alloc, M_DEVBUF, M_WAITOK | M_ZERO); for (i = 0; i < count; i++) if (vectors[i] != 0) used[vectors[i] - 1] = 1; for (i = 0; i < msix->msix_alloc - 1; i++) if (used[i] == 0 && used[i + 1] == 1) { free(used, M_DEVBUF); return (EINVAL); } if (used[0] != 1) { free(used, M_DEVBUF); return (EINVAL); } /* Make sure none of the resources are allocated. */ for (i = 0; i < msix->msix_table_len; i++) { if (msix->msix_table[i].mte_vector == 0) continue; if (msix->msix_table[i].mte_handlers > 0) { free(used, M_DEVBUF); return (EBUSY); } rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); KASSERT(rle != NULL, ("missing resource")); if (rle->res != NULL) { free(used, M_DEVBUF); return (EBUSY); } } /* Free the existing resource list entries. */ for (i = 0; i < msix->msix_table_len; i++) { if (msix->msix_table[i].mte_vector == 0) continue; resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1); } /* * Build the new virtual table keeping track of which vectors are * used. */ free(msix->msix_table, M_DEVBUF); msix->msix_table = malloc(sizeof(struct msix_table_entry) * count, M_DEVBUF, M_WAITOK | M_ZERO); for (i = 0; i < count; i++) msix->msix_table[i].mte_vector = vectors[i]; msix->msix_table_len = count; /* Free any unused IRQs and resize the vectors array if necessary. */ j = msix->msix_alloc - 1; if (used[j] == 0) { struct msix_vector *vec; while (used[j] == 0) { PCIB_RELEASE_MSIX(device_get_parent(dev), child, msix->msix_vectors[j].mv_irq); j--; } vec = malloc(sizeof(struct msix_vector) * (j + 1), M_DEVBUF, M_WAITOK); bcopy(msix->msix_vectors, vec, sizeof(struct msix_vector) * (j + 1)); free(msix->msix_vectors, M_DEVBUF); msix->msix_vectors = vec; msix->msix_alloc = j + 1; } free(used, M_DEVBUF); /* Map the IRQs onto the rids. */ for (i = 0; i < count; i++) { if (vectors[i] == 0) continue; irq = msix->msix_vectors[vectors[i] - 1].mv_irq; resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irq, irq, 1); } if (bootverbose) { device_printf(child, "Remapped MSI-X IRQs as: "); for (i = 0; i < count; i++) { if (i != 0) printf(", "); if (vectors[i] == 0) printf("---"); else printf("%d", msix->msix_vectors[vectors[i] - 1].mv_irq); } printf("\n"); } return (0); } static int pci_release_msix(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; struct resource_list_entry *rle; int i; /* Do we have any messages to release? */ if (msix->msix_alloc == 0) return (ENODEV); /* Make sure none of the resources are allocated. */ for (i = 0; i < msix->msix_table_len; i++) { if (msix->msix_table[i].mte_vector == 0) continue; if (msix->msix_table[i].mte_handlers > 0) return (EBUSY); rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); KASSERT(rle != NULL, ("missing resource")); if (rle->res != NULL) return (EBUSY); } /* Update control register to disable MSI-X. */ msix->msix_ctrl &= ~PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(child, msix->msix_location + PCIR_MSIX_CTRL, msix->msix_ctrl, 2); /* Free the resource list entries. 
*/ for (i = 0; i < msix->msix_table_len; i++) { if (msix->msix_table[i].mte_vector == 0) continue; resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1); } free(msix->msix_table, M_DEVBUF); msix->msix_table_len = 0; /* Release the IRQs. */ for (i = 0; i < msix->msix_alloc; i++) PCIB_RELEASE_MSIX(device_get_parent(dev), child, msix->msix_vectors[i].mv_irq); free(msix->msix_vectors, M_DEVBUF); msix->msix_alloc = 0; return (0); } /* * Return the max supported MSI-X messages this device supports. * Basically, assuming the MD code can alloc messages, this function * should return the maximum value that pci_alloc_msix() can return. * Thus, it is subject to the tunables, etc. */ int pci_msix_count_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; if (pci_do_msix && msix->msix_location != 0) return (msix->msix_msgnum); return (0); } int pci_msix_pba_bar_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; if (pci_do_msix && msix->msix_location != 0) return (msix->msix_pba_bar); return (-1); } int pci_msix_table_bar_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msix *msix = &dinfo->cfg.msix; if (pci_do_msix && msix->msix_location != 0) return (msix->msix_table_bar); return (-1); } /* * HyperTransport MSI mapping control */ void pci_ht_map_msi(device_t dev, uint64_t addr) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_ht *ht = &dinfo->cfg.ht; if (!ht->ht_msimap) return; if (addr && !(ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) && ht->ht_msiaddr >> 20 == addr >> 20) { /* Enable MSI -> HT mapping. */ ht->ht_msictrl |= PCIM_HTCMD_MSI_ENABLE; pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND, ht->ht_msictrl, 2); } if (!addr && ht->ht_msictrl & PCIM_HTCMD_MSI_ENABLE) { /* Disable MSI -> HT mapping. 
*/ ht->ht_msictrl &= ~PCIM_HTCMD_MSI_ENABLE; pci_write_config(dev, ht->ht_msimap + PCIR_HT_COMMAND, ht->ht_msictrl, 2); } } int pci_get_relaxed_ordering_enabled(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; uint16_t val; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (0); val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2); val &= PCIEM_CTL_RELAXED_ORD_ENABLE; return (val != 0); } int pci_get_max_payload(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; uint16_t val; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (0); val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2); val &= PCIEM_CTL_MAX_PAYLOAD; val >>= 5; return (1 << (val + 7)); } int pci_get_max_read_req(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; uint16_t val; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (0); val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2); val &= PCIEM_CTL_MAX_READ_REQUEST; val >>= 12; return (1 << (val + 7)); } int pci_set_max_read_req(device_t dev, int size) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; uint16_t val; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (0); if (size < 128) size = 128; if (size > 4096) size = 4096; size = (1 << (fls(size) - 1)); val = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2); val &= ~PCIEM_CTL_MAX_READ_REQUEST; val |= (fls(size) - 8) << 12; pci_write_config(dev, cap + PCIER_DEVICE_CTL, val, 2); return (size); } uint32_t pcie_read_config(device_t dev, int reg, int width) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) { if (width == 2) return (0xffff); return (0xffffffff); } return (pci_read_config(dev, cap + reg, width)); } void pcie_write_config(device_t dev, int reg, uint32_t value, int width) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return; pci_write_config(dev, cap + reg, value, width); } /* * Adjusts a PCI-e capability register by clearing the bits in mask * and setting the bits in (value & mask). Bits not set in mask are * not adjusted. * * Returns the old value on success or all ones on failure. */ uint32_t pcie_adjust_config(device_t dev, int reg, uint32_t mask, uint32_t value, int width) { struct pci_devinfo *dinfo = device_get_ivars(dev); uint32_t old, new; int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) { if (width == 2) return (0xffff); return (0xffffffff); } old = pci_read_config(dev, cap + reg, width); new = old & ~mask; new |= (value & mask); pci_write_config(dev, cap + reg, new, width); return (old); } /* * Support for MSI message signalled interrupts. */ void pci_enable_msi_method(device_t dev, device_t child, uint64_t address, uint16_t data) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msi *msi = &dinfo->cfg.msi; /* Write data and address values. */ pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR, address & 0xffffffff, 4); if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) { pci_write_config(child, msi->msi_location + PCIR_MSI_ADDR_HIGH, address >> 32, 4); pci_write_config(child, msi->msi_location + PCIR_MSI_DATA_64BIT, data, 2); } else pci_write_config(child, msi->msi_location + PCIR_MSI_DATA, data, 2); /* Enable MSI in the control register. */ msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE; pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl, 2); /* Enable MSI -> HT mapping. 
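 * This is a no-op unless a HyperTransport MSI mapping capability was
 * discovered for this device; pci_ht_map_msi() checks for that before
 * touching any registers.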
*/ pci_ht_map_msi(child, address); } void pci_disable_msi_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msi *msi = &dinfo->cfg.msi; /* Disable MSI -> HT mapping. */ pci_ht_map_msi(child, 0); /* Disable MSI in the control register. */ msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE; pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl, 2); } /* * Restore MSI registers during resume. If MSI is enabled then * restore the data and address registers in addition to the control * register. */ static void pci_resume_msi(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); struct pcicfg_msi *msi = &dinfo->cfg.msi; uint64_t address; uint16_t data; if (msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE) { address = msi->msi_addr; data = msi->msi_data; pci_write_config(dev, msi->msi_location + PCIR_MSI_ADDR, address & 0xffffffff, 4); if (msi->msi_ctrl & PCIM_MSICTRL_64BIT) { pci_write_config(dev, msi->msi_location + PCIR_MSI_ADDR_HIGH, address >> 32, 4); pci_write_config(dev, msi->msi_location + PCIR_MSI_DATA_64BIT, data, 2); } else pci_write_config(dev, msi->msi_location + PCIR_MSI_DATA, data, 2); } pci_write_config(dev, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl, 2); } static int pci_remap_intr_method(device_t bus, device_t dev, u_int irq) { struct pci_devinfo *dinfo = device_get_ivars(dev); pcicfgregs *cfg = &dinfo->cfg; struct resource_list_entry *rle; struct msix_table_entry *mte; struct msix_vector *mv; uint64_t addr; uint32_t data; int error, i, j; /* * Handle MSI first. We try to find this IRQ among our list * of MSI IRQs. If we find it, we request updated address and * data registers and apply the results. */ if (cfg->msi.msi_alloc > 0) { /* If we don't have any active handlers, nothing to do. */ if (cfg->msi.msi_handlers == 0) return (0); for (i = 0; i < cfg->msi.msi_alloc; i++) { rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); if (rle->start == irq) { error = PCIB_MAP_MSI(device_get_parent(bus), dev, irq, &addr, &data); if (error) return (error); pci_disable_msi(dev); dinfo->cfg.msi.msi_addr = addr; dinfo->cfg.msi.msi_data = data; pci_enable_msi(dev, addr, data); return (0); } } return (ENOENT); } /* * For MSI-X, we check to see if we have this IRQ. If we do, * we request the updated mapping info. If that works, we go * through all the slots that use this IRQ and update them. */ if (cfg->msix.msix_alloc > 0) { for (i = 0; i < cfg->msix.msix_alloc; i++) { mv = &cfg->msix.msix_vectors[i]; if (mv->mv_irq == irq) { error = PCIB_MAP_MSI(device_get_parent(bus), dev, irq, &addr, &data); if (error) return (error); mv->mv_address = addr; mv->mv_data = data; for (j = 0; j < cfg->msix.msix_table_len; j++) { mte = &cfg->msix.msix_table[j]; if (mte->mte_vector != i + 1) continue; if (mte->mte_handlers == 0) continue; pci_mask_msix(dev, j); pci_enable_msix(dev, j, addr, data); pci_unmask_msix(dev, j); } } } return (ENOENT); } return (ENOENT); } /* * Returns true if the specified device is blacklisted because MSI * doesn't work. */ int pci_msi_device_blacklisted(device_t dev) { if (!pci_honor_msi_blacklist) return (0); return (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSI)); } /* * Determine if MSI is blacklisted globally on this system. Currently, * we just check for blacklisted chipsets as represented by the * host-PCI bridge at device 0:0:0. 
In the future, it may become * necessary to check other system attributes, such as the kenv values * that give the motherboard manufacturer and model number. */ static int pci_msi_blacklisted(void) { device_t dev; if (!pci_honor_msi_blacklist) return (0); /* Blacklist all non-PCI-express and non-PCI-X chipsets. */ if (!(pcie_chipset || pcix_chipset)) { if (vm_guest != VM_GUEST_NO) { /* * Whitelist older chipsets in virtual * machines known to support MSI. */ dev = pci_find_bsf(0, 0, 0); if (dev != NULL) return (!pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_ENABLE_MSI_VM)); } return (1); } dev = pci_find_bsf(0, 0, 0); if (dev != NULL) return (pci_msi_device_blacklisted(dev)); return (0); } /* * Returns true if the specified device is blacklisted because MSI-X * doesn't work. Note that this assumes that if MSI doesn't work, * MSI-X doesn't either. */ int pci_msix_device_blacklisted(device_t dev) { if (!pci_honor_msi_blacklist) return (0); if (pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSIX)) return (1); return (pci_msi_device_blacklisted(dev)); } /* * Determine if MSI-X is blacklisted globally on this system. If MSI * is blacklisted, assume that MSI-X is as well. Check for additional * chipsets where MSI works but MSI-X does not. */ static int pci_msix_blacklisted(void) { device_t dev; if (!pci_honor_msi_blacklist) return (0); dev = pci_find_bsf(0, 0, 0); if (dev != NULL && pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_DISABLE_MSIX)) return (1); return (pci_msi_blacklisted()); } /* * Attempt to allocate *count MSI messages. The actual number allocated is * returned in *count. After this function returns, each message will be * available to the driver as SYS_RES_IRQ resources starting at a rid 1. */ int pci_alloc_msi_method(device_t dev, device_t child, int *count) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; struct resource_list_entry *rle; int actual, error, i, irqs[32]; uint16_t ctrl; /* Don't let count == 0 get us into trouble. */ if (*count == 0) return (EINVAL); /* If rid 0 is allocated, then fail. */ rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0); if (rle != NULL && rle->res != NULL) return (ENXIO); /* Already have allocated messages? */ if (cfg->msi.msi_alloc != 0 || cfg->msix.msix_alloc != 0) return (ENXIO); /* If MSI is blacklisted for this system, fail. */ if (pci_msi_blacklisted()) return (ENXIO); /* MSI capability present? */ if (cfg->msi.msi_location == 0 || !pci_do_msi) return (ENODEV); if (bootverbose) device_printf(child, "attempting to allocate %d MSI vectors (%d supported)\n", *count, cfg->msi.msi_msgnum); /* Don't ask for more than the device supports. */ actual = min(*count, cfg->msi.msi_msgnum); /* Don't ask for more than 32 messages. */ actual = min(actual, 32); /* MSI requires power of 2 number of messages. */ if (!powerof2(actual)) return (EINVAL); for (;;) { /* Try to allocate N messages. */ error = PCIB_ALLOC_MSI(device_get_parent(dev), child, actual, actual, irqs); if (error == 0) break; if (actual == 1) return (error); /* Try N / 2. */ actual >>= 1; } /* * We now have N actual messages mapped onto SYS_RES_IRQ * resources in the irqs[] array, so add new resources * starting at rid 1. */ for (i = 0; i < actual; i++) resource_list_add(&dinfo->resources, SYS_RES_IRQ, i + 1, irqs[i], irqs[i], 1); if (bootverbose) { if (actual == 1) device_printf(child, "using IRQ %d for MSI\n", irqs[0]); else { int run; /* * Be fancy and try to print contiguous runs * of IRQ values as ranges. 
'run' is true if * we are in a range. */ device_printf(child, "using IRQs %d", irqs[0]); run = 0; for (i = 1; i < actual; i++) { /* Still in a run? */ if (irqs[i] == irqs[i - 1] + 1) { run = 1; continue; } /* Finish previous range. */ if (run) { printf("-%d", irqs[i - 1]); run = 0; } /* Start new range. */ printf(",%d", irqs[i]); } /* Unfinished range? */ if (run) printf("-%d", irqs[actual - 1]); printf(" for MSI\n"); } } /* Update control register with actual count. */ ctrl = cfg->msi.msi_ctrl; ctrl &= ~PCIM_MSICTRL_MME_MASK; ctrl |= (ffs(actual) - 1) << 4; cfg->msi.msi_ctrl = ctrl; pci_write_config(child, cfg->msi.msi_location + PCIR_MSI_CTRL, ctrl, 2); /* Update counts of alloc'd messages. */ cfg->msi.msi_alloc = actual; cfg->msi.msi_handlers = 0; *count = actual; return (0); } /* Release the MSI messages associated with this device. */ int pci_release_msi_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msi *msi = &dinfo->cfg.msi; struct resource_list_entry *rle; int error, i, irqs[32]; /* Try MSI-X first. */ error = pci_release_msix(dev, child); if (error != ENODEV) return (error); /* Do we have any messages to release? */ if (msi->msi_alloc == 0) return (ENODEV); KASSERT(msi->msi_alloc <= 32, ("more than 32 alloc'd messages")); /* Make sure none of the resources are allocated. */ if (msi->msi_handlers > 0) return (EBUSY); for (i = 0; i < msi->msi_alloc; i++) { rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, i + 1); KASSERT(rle != NULL, ("missing MSI resource")); if (rle->res != NULL) return (EBUSY); irqs[i] = rle->start; } /* Update control register with 0 count. */ KASSERT(!(msi->msi_ctrl & PCIM_MSICTRL_MSI_ENABLE), ("%s: MSI still enabled", __func__)); msi->msi_ctrl &= ~PCIM_MSICTRL_MME_MASK; pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL, msi->msi_ctrl, 2); /* Release the messages. */ PCIB_RELEASE_MSI(device_get_parent(dev), child, msi->msi_alloc, irqs); for (i = 0; i < msi->msi_alloc; i++) resource_list_delete(&dinfo->resources, SYS_RES_IRQ, i + 1); /* Update alloc count. */ msi->msi_alloc = 0; msi->msi_addr = 0; msi->msi_data = 0; return (0); } /* * Return the max supported MSI messages this device supports. * Basically, assuming the MD code can alloc messages, this function * should return the maximum value that pci_alloc_msi() can return. * Thus, it is subject to the tunables, etc. 
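 *
 * A hedged sketch of typical driver usage (error handling omitted;
 * 'sc' and its irq_res field are hypothetical):
 *
 *      int count, rid;
 *
 *      rid = 0;                        (default to the legacy INTx rid)
 *      count = pci_msi_count(dev);
 *      if (count > 0 && pci_alloc_msi(dev, &count) == 0)
 *              rid = 1;                (first MSI message is rid 1)
 *      sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
 *          &rid, RF_ACTIVE);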
*/ int pci_msi_count_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); struct pcicfg_msi *msi = &dinfo->cfg.msi; if (pci_do_msi && msi->msi_location != 0) return (msi->msi_msgnum); return (0); } /* free pcicfgregs structure and all depending data structures */ int pci_freecfg(struct pci_devinfo *dinfo) { struct devlist *devlist_head; struct pci_map *pm, *next; int i; devlist_head = &pci_devq; if (dinfo->cfg.vpd.vpd_reg) { free(dinfo->cfg.vpd.vpd_ident, M_DEVBUF); for (i = 0; i < dinfo->cfg.vpd.vpd_rocnt; i++) free(dinfo->cfg.vpd.vpd_ros[i].value, M_DEVBUF); free(dinfo->cfg.vpd.vpd_ros, M_DEVBUF); for (i = 0; i < dinfo->cfg.vpd.vpd_wcnt; i++) free(dinfo->cfg.vpd.vpd_w[i].value, M_DEVBUF); free(dinfo->cfg.vpd.vpd_w, M_DEVBUF); } STAILQ_FOREACH_SAFE(pm, &dinfo->cfg.maps, pm_link, next) { free(pm, M_DEVBUF); } STAILQ_REMOVE(devlist_head, dinfo, pci_devinfo, pci_links); free(dinfo, M_DEVBUF); /* increment the generation count */ pci_generation++; /* we're losing one device */ pci_numdevs--; return (0); } /* * PCI power manangement */ int pci_set_powerstate_method(device_t dev, device_t child, int state) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; uint16_t status; int oldstate, highest, delay; if (cfg->pp.pp_cap == 0) return (EOPNOTSUPP); /* * Optimize a no state change request away. While it would be OK to * write to the hardware in theory, some devices have shown odd * behavior when going from D3 -> D3. */ oldstate = pci_get_powerstate(child); if (oldstate == state) return (0); /* * The PCI power management specification states that after a state * transition between PCI power states, system software must * guarantee a minimal delay before the function accesses the device. * Compute the worst case delay that we need to guarantee before we * access the device. Many devices will be responsive much more * quickly than this delay, but there are some that don't respond * instantly to state changes. Transitions to/from D3 state require * 10ms, while D2 requires 200us, and D0/1 require none. The delay * is done below with DELAY rather than a sleeper function because * this function can be called from contexts where we cannot sleep. */ highest = (oldstate > state) ? 
oldstate : state; if (highest == PCI_POWERSTATE_D3) delay = 10000; else if (highest == PCI_POWERSTATE_D2) delay = 200; else delay = 0; status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2) & ~PCIM_PSTAT_DMASK; switch (state) { case PCI_POWERSTATE_D0: status |= PCIM_PSTAT_D0; break; case PCI_POWERSTATE_D1: if ((cfg->pp.pp_cap & PCIM_PCAP_D1SUPP) == 0) return (EOPNOTSUPP); status |= PCIM_PSTAT_D1; break; case PCI_POWERSTATE_D2: if ((cfg->pp.pp_cap & PCIM_PCAP_D2SUPP) == 0) return (EOPNOTSUPP); status |= PCIM_PSTAT_D2; break; case PCI_POWERSTATE_D3: status |= PCIM_PSTAT_D3; break; default: return (EINVAL); } if (bootverbose) pci_printf(cfg, "Transition from D%d to D%d\n", oldstate, state); PCI_WRITE_CONFIG(dev, child, cfg->pp.pp_status, status, 2); if (delay) DELAY(delay); return (0); } int pci_get_powerstate_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; uint16_t status; int result; if (cfg->pp.pp_cap != 0) { status = PCI_READ_CONFIG(dev, child, cfg->pp.pp_status, 2); switch (status & PCIM_PSTAT_DMASK) { case PCIM_PSTAT_D0: result = PCI_POWERSTATE_D0; break; case PCIM_PSTAT_D1: result = PCI_POWERSTATE_D1; break; case PCIM_PSTAT_D2: result = PCI_POWERSTATE_D2; break; case PCIM_PSTAT_D3: result = PCI_POWERSTATE_D3; break; default: result = PCI_POWERSTATE_UNKNOWN; break; } } else { /* No support, device is always at D0 */ result = PCI_POWERSTATE_D0; } return (result); } /* * Some convenience functions for PCI device drivers. */ static __inline void pci_set_command_bit(device_t dev, device_t child, uint16_t bit) { uint16_t command; command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2); command |= bit; PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2); } static __inline void pci_clear_command_bit(device_t dev, device_t child, uint16_t bit) { uint16_t command; command = PCI_READ_CONFIG(dev, child, PCIR_COMMAND, 2); command &= ~bit; PCI_WRITE_CONFIG(dev, child, PCIR_COMMAND, command, 2); } int pci_enable_busmaster_method(device_t dev, device_t child) { pci_set_command_bit(dev, child, PCIM_CMD_BUSMASTEREN); return (0); } int pci_disable_busmaster_method(device_t dev, device_t child) { pci_clear_command_bit(dev, child, PCIM_CMD_BUSMASTEREN); return (0); } int pci_enable_io_method(device_t dev, device_t child, int space) { uint16_t bit; switch(space) { case SYS_RES_IOPORT: bit = PCIM_CMD_PORTEN; break; case SYS_RES_MEMORY: bit = PCIM_CMD_MEMEN; break; default: return (EINVAL); } pci_set_command_bit(dev, child, bit); return (0); } int pci_disable_io_method(device_t dev, device_t child, int space) { uint16_t bit; switch(space) { case SYS_RES_IOPORT: bit = PCIM_CMD_PORTEN; break; case SYS_RES_MEMORY: bit = PCIM_CMD_MEMEN; break; default: return (EINVAL); } pci_clear_command_bit(dev, child, bit); return (0); } /* * New style pci driver. Parent device is either a pci-host-bridge or a * pci-pci-bridge. Both kinds are represented by instances of pcib. 
*/ void pci_print_verbose(struct pci_devinfo *dinfo) { if (bootverbose) { pcicfgregs *cfg = &dinfo->cfg; printf("found->\tvendor=0x%04x, dev=0x%04x, revid=0x%02x\n", cfg->vendor, cfg->device, cfg->revid); printf("\tdomain=%d, bus=%d, slot=%d, func=%d\n", cfg->domain, cfg->bus, cfg->slot, cfg->func); printf("\tclass=%02x-%02x-%02x, hdrtype=0x%02x, mfdev=%d\n", cfg->baseclass, cfg->subclass, cfg->progif, cfg->hdrtype, cfg->mfdev); printf("\tcmdreg=0x%04x, statreg=0x%04x, cachelnsz=%d (dwords)\n", cfg->cmdreg, cfg->statreg, cfg->cachelnsz); printf("\tlattimer=0x%02x (%d ns), mingnt=0x%02x (%d ns), maxlat=0x%02x (%d ns)\n", cfg->lattimer, cfg->lattimer * 30, cfg->mingnt, cfg->mingnt * 250, cfg->maxlat, cfg->maxlat * 250); if (cfg->intpin > 0) printf("\tintpin=%c, irq=%d\n", cfg->intpin +'a' -1, cfg->intline); if (cfg->pp.pp_cap) { uint16_t status; status = pci_read_config(cfg->dev, cfg->pp.pp_status, 2); printf("\tpowerspec %d supports D0%s%s D3 current D%d\n", cfg->pp.pp_cap & PCIM_PCAP_SPEC, cfg->pp.pp_cap & PCIM_PCAP_D1SUPP ? " D1" : "", cfg->pp.pp_cap & PCIM_PCAP_D2SUPP ? " D2" : "", status & PCIM_PSTAT_DMASK); } if (cfg->msi.msi_location) { int ctrl; ctrl = cfg->msi.msi_ctrl; printf("\tMSI supports %d message%s%s%s\n", cfg->msi.msi_msgnum, (cfg->msi.msi_msgnum == 1) ? "" : "s", (ctrl & PCIM_MSICTRL_64BIT) ? ", 64 bit" : "", (ctrl & PCIM_MSICTRL_VECTOR) ? ", vector masks":""); } if (cfg->msix.msix_location) { printf("\tMSI-X supports %d message%s ", cfg->msix.msix_msgnum, (cfg->msix.msix_msgnum == 1) ? "" : "s"); if (cfg->msix.msix_table_bar == cfg->msix.msix_pba_bar) printf("in map 0x%x\n", cfg->msix.msix_table_bar); else printf("in maps 0x%x and 0x%x\n", cfg->msix.msix_table_bar, cfg->msix.msix_pba_bar); } } } static int pci_porten(device_t dev) { return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_PORTEN) != 0; } static int pci_memen(device_t dev) { return (pci_read_config(dev, PCIR_COMMAND, 2) & PCIM_CMD_MEMEN) != 0; } void pci_read_bar(device_t dev, int reg, pci_addr_t *mapp, pci_addr_t *testvalp, int *bar64) { struct pci_devinfo *dinfo; pci_addr_t map, testval; int ln2range; uint16_t cmd; /* * The device ROM BAR is special. It is always a 32-bit * memory BAR. Bit 0 is special and should not be set when * sizing the BAR. */ dinfo = device_get_ivars(dev); if (PCIR_IS_BIOS(&dinfo->cfg, reg)) { map = pci_read_config(dev, reg, 4); pci_write_config(dev, reg, 0xfffffffe, 4); testval = pci_read_config(dev, reg, 4); pci_write_config(dev, reg, map, 4); *mapp = map; *testvalp = testval; if (bar64 != NULL) *bar64 = 0; return; } map = pci_read_config(dev, reg, 4); ln2range = pci_maprange(map); if (ln2range == 64) map |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32; /* * Disable decoding via the command register before * determining the BAR's length since we will be placing it in * a weird state. */ cmd = pci_read_config(dev, PCIR_COMMAND, 2); pci_write_config(dev, PCIR_COMMAND, cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2); /* * Determine the BAR's length by writing all 1's. The bottom * log_2(size) bits of the BAR will stick as 0 when we read * the value back. * * NB: according to the PCI Local Bus Specification, rev. 3.0: * "Software writes 0FFFFFFFFh to both registers, reads them back, * and combines the result into a 64-bit value." (section 6.2.5.1) * * Writes to both registers must be performed before attempting to * read back the size value. 
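 *
 * As a worked example: a 32-bit memory BAR that reads back 0xfffff000
 * after the write of all 1's has its low 12 address bits hard-wired
 * to zero, so it decodes 1 << 12 = 4096 bytes (the low flag bits are
 * masked off before the size is computed from the value read below).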
*/ testval = 0; pci_write_config(dev, reg, 0xffffffff, 4); if (ln2range == 64) { pci_write_config(dev, reg + 4, 0xffffffff, 4); testval |= (pci_addr_t)pci_read_config(dev, reg + 4, 4) << 32; } testval |= pci_read_config(dev, reg, 4); /* * Restore the original value of the BAR. We may have reprogrammed * the BAR of the low-level console device and when booting verbose, * we need the console device addressable. */ pci_write_config(dev, reg, map, 4); if (ln2range == 64) pci_write_config(dev, reg + 4, map >> 32, 4); pci_write_config(dev, PCIR_COMMAND, cmd, 2); *mapp = map; *testvalp = testval; if (bar64 != NULL) *bar64 = (ln2range == 64); } static void pci_write_bar(device_t dev, struct pci_map *pm, pci_addr_t base) { struct pci_devinfo *dinfo; int ln2range; /* The device ROM BAR is always a 32-bit memory BAR. */ dinfo = device_get_ivars(dev); if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg)) ln2range = 32; else ln2range = pci_maprange(pm->pm_value); pci_write_config(dev, pm->pm_reg, base, 4); if (ln2range == 64) pci_write_config(dev, pm->pm_reg + 4, base >> 32, 4); pm->pm_value = pci_read_config(dev, pm->pm_reg, 4); if (ln2range == 64) pm->pm_value |= (pci_addr_t)pci_read_config(dev, pm->pm_reg + 4, 4) << 32; } struct pci_map * pci_find_bar(device_t dev, int reg) { struct pci_devinfo *dinfo; struct pci_map *pm; dinfo = device_get_ivars(dev); STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) { if (pm->pm_reg == reg) return (pm); } return (NULL); } int pci_bar_enabled(device_t dev, struct pci_map *pm) { struct pci_devinfo *dinfo; uint16_t cmd; dinfo = device_get_ivars(dev); if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) && !(pm->pm_value & PCIM_BIOS_ENABLE)) return (0); #ifdef PCI_IOV if ((dinfo->cfg.flags & PCICFG_VF) != 0) { struct pcicfg_iov *iov; iov = dinfo->cfg.iov; cmd = pci_read_config(iov->iov_pf, iov->iov_pos + PCIR_SRIOV_CTL, 2); return ((cmd & PCIM_SRIOV_VF_MSE) != 0); } #endif cmd = pci_read_config(dev, PCIR_COMMAND, 2); if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg) || PCI_BAR_MEM(pm->pm_value)) return ((cmd & PCIM_CMD_MEMEN) != 0); else return ((cmd & PCIM_CMD_PORTEN) != 0); } struct pci_map * pci_add_bar(device_t dev, int reg, pci_addr_t value, pci_addr_t size) { struct pci_devinfo *dinfo; struct pci_map *pm, *prev; dinfo = device_get_ivars(dev); pm = malloc(sizeof(*pm), M_DEVBUF, M_WAITOK | M_ZERO); pm->pm_reg = reg; pm->pm_value = value; pm->pm_size = size; STAILQ_FOREACH(prev, &dinfo->cfg.maps, pm_link) { KASSERT(prev->pm_reg != pm->pm_reg, ("duplicate map %02x", reg)); if (STAILQ_NEXT(prev, pm_link) == NULL || STAILQ_NEXT(prev, pm_link)->pm_reg > pm->pm_reg) break; } if (prev != NULL) STAILQ_INSERT_AFTER(&dinfo->cfg.maps, prev, pm, pm_link); else STAILQ_INSERT_TAIL(&dinfo->cfg.maps, pm, pm_link); return (pm); } static void pci_restore_bars(device_t dev) { struct pci_devinfo *dinfo; struct pci_map *pm; int ln2range; dinfo = device_get_ivars(dev); STAILQ_FOREACH(pm, &dinfo->cfg.maps, pm_link) { if (PCIR_IS_BIOS(&dinfo->cfg, pm->pm_reg)) ln2range = 32; else ln2range = pci_maprange(pm->pm_value); pci_write_config(dev, pm->pm_reg, pm->pm_value, 4); if (ln2range == 64) pci_write_config(dev, pm->pm_reg + 4, pm->pm_value >> 32, 4); } } /* * Add a resource based on a pci map register. Return 1 if the map * register is a 32bit map register or 2 if it is a 64bit register. 
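 * In other words the return value is the number of 32-bit BAR
 * registers consumed, which lets callers such as pci_add_resources()
 * advance their BAR index past a 64-bit map.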
*/ static int pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl, int force, int prefetch) { struct pci_map *pm; pci_addr_t base, map, testval; pci_addr_t start, end, count; int barlen, basezero, flags, maprange, mapsize, type; uint16_t cmd; struct resource *res; /* * The BAR may already exist if the device is a CardBus card * whose CIS is stored in this BAR. */ pm = pci_find_bar(dev, reg); if (pm != NULL) { maprange = pci_maprange(pm->pm_value); barlen = maprange == 64 ? 2 : 1; return (barlen); } pci_read_bar(dev, reg, &map, &testval, NULL); if (PCI_BAR_MEM(map)) { type = SYS_RES_MEMORY; if (map & PCIM_BAR_MEM_PREFETCH) prefetch = 1; } else type = SYS_RES_IOPORT; mapsize = pci_mapsize(testval); base = pci_mapbase(map); #ifdef __PCI_BAR_ZERO_VALID basezero = 0; #else basezero = base == 0; #endif maprange = pci_maprange(map); barlen = maprange == 64 ? 2 : 1; /* * For I/O registers, if bottom bit is set, and the next bit up * isn't clear, we know we have a BAR that doesn't conform to the * spec, so ignore it. Also, sanity check the size of the data * areas to the type of memory involved. Memory must be at least * 16 bytes in size, while I/O ranges must be at least 4. */ if (PCI_BAR_IO(testval) && (testval & PCIM_BAR_IO_RESERVED) != 0) return (barlen); if ((type == SYS_RES_MEMORY && mapsize < 4) || (type == SYS_RES_IOPORT && mapsize < 2)) return (barlen); /* Save a record of this BAR. */ pm = pci_add_bar(dev, reg, map, mapsize); if (bootverbose) { printf("\tmap[%02x]: type %s, range %2d, base %#jx, size %2d", reg, pci_maptype(map), maprange, (uintmax_t)base, mapsize); if (type == SYS_RES_IOPORT && !pci_porten(dev)) printf(", port disabled\n"); else if (type == SYS_RES_MEMORY && !pci_memen(dev)) printf(", memory disabled\n"); else printf(", enabled\n"); } /* * If base is 0, then we have problems if this architecture does * not allow that. It is best to ignore such entries for the * moment. These will be allocated later if the driver specifically * requests them. However, some removable buses look better when * all resources are allocated, so allow '0' to be overriden. * * Similarly treat maps whose values is the same as the test value * read back. These maps have had all f's written to them by the * BIOS in an attempt to disable the resources. */ if (!force && (basezero || map == testval)) return (barlen); if ((u_long)base != base) { device_printf(bus, "pci%d:%d:%d:%d bar %#x too many address bits", pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev), reg); return (barlen); } /* * This code theoretically does the right thing, but has * undesirable side effects in some cases where peripherals * respond oddly to having these bits enabled. Let the user * be able to turn them off (since pci_enable_io_modes is 1 by * default). 
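 * When pci_enable_io_modes is turned off (it is exposed as a loader
 * tunable under hw.pci), a BAR whose decode bit was left clear by the
 * firmware is simply skipped below rather than force-enabled.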
 */ if (pci_enable_io_modes) { /* Turn on resources that have been left off by a lazy BIOS */ if (type == SYS_RES_IOPORT && !pci_porten(dev)) { cmd = pci_read_config(dev, PCIR_COMMAND, 2); cmd |= PCIM_CMD_PORTEN; pci_write_config(dev, PCIR_COMMAND, cmd, 2); } if (type == SYS_RES_MEMORY && !pci_memen(dev)) { cmd = pci_read_config(dev, PCIR_COMMAND, 2); cmd |= PCIM_CMD_MEMEN; pci_write_config(dev, PCIR_COMMAND, cmd, 2); } } else { if (type == SYS_RES_IOPORT && !pci_porten(dev)) return (barlen); if (type == SYS_RES_MEMORY && !pci_memen(dev)) return (barlen); } count = (pci_addr_t)1 << mapsize; flags = RF_ALIGNMENT_LOG2(mapsize); if (prefetch) flags |= RF_PREFETCHABLE; if (basezero || base == pci_mapbase(testval) || pci_clear_bars) { start = 0; /* Let the parent decide. */ end = ~0; } else { start = base; end = base + count - 1; } resource_list_add(rl, type, reg, start, end, count); /* * Try to allocate the resource for this BAR from our parent * so that this resource range is already reserved. The * driver for this device will later inherit this resource in * pci_alloc_resource(). */ res = resource_list_reserve(rl, bus, dev, type, &reg, start, end, count, flags); if ((pci_do_realloc_bars || pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_REALLOC_BAR)) && res == NULL && (start != 0 || end != ~0)) { /* * If the allocation fails, try to allocate a resource for * this BAR using any available range. The firmware felt * it was important enough to assign a resource, so don't * disable decoding if we can help it. */ resource_list_delete(rl, type, reg); resource_list_add(rl, type, reg, 0, ~0, count); res = resource_list_reserve(rl, bus, dev, type, &reg, 0, ~0, count, flags); } if (res == NULL) { /* * If the allocation fails, delete the resource list entry * and disable decoding for this device. * * If the driver requests this resource in the future, * pci_reserve_map() will try to allocate a fresh * resource range. */ resource_list_delete(rl, type, reg); pci_disable_io(dev, type); if (bootverbose) device_printf(bus, "pci%d:%d:%d:%d bar %#x failed to allocate\n", pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev), reg); } else { start = rman_get_start(res); pci_write_bar(dev, pm, start); } return (barlen); } /* * For ATA devices we need to decide early what addressing mode to use. * Legacy demands that the primary and secondary ATA ports sit on the * same addresses that old ISA hardware did. This dictates that we use * those addresses and ignore the BARs if we cannot set PCI native * addressing mode.
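 * In compatibility mode this means reserving the fixed ISA-era ranges
 * instead: 0x1f0-0x1f7 plus 0x3f6 for the primary channel and
 * 0x170-0x177 plus 0x376 for the secondary channel, exactly as
 * hard-coded in the function below.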
*/ static void pci_ata_maps(device_t bus, device_t dev, struct resource_list *rl, int force, uint32_t prefetchmask) { int rid, type, progif; #if 0 /* if this device supports PCI native addressing use it */ progif = pci_read_config(dev, PCIR_PROGIF, 1); if ((progif & 0x8a) == 0x8a) { if (pci_mapbase(pci_read_config(dev, PCIR_BAR(0), 4)) && pci_mapbase(pci_read_config(dev, PCIR_BAR(2), 4))) { printf("Trying ATA native PCI addressing mode\n"); pci_write_config(dev, PCIR_PROGIF, progif | 0x05, 1); } } #endif progif = pci_read_config(dev, PCIR_PROGIF, 1); type = SYS_RES_IOPORT; if (progif & PCIP_STORAGE_IDE_MODEPRIM) { pci_add_map(bus, dev, PCIR_BAR(0), rl, force, prefetchmask & (1 << 0)); pci_add_map(bus, dev, PCIR_BAR(1), rl, force, prefetchmask & (1 << 1)); } else { rid = PCIR_BAR(0); resource_list_add(rl, type, rid, 0x1f0, 0x1f7, 8); (void)resource_list_reserve(rl, bus, dev, type, &rid, 0x1f0, 0x1f7, 8, 0); rid = PCIR_BAR(1); resource_list_add(rl, type, rid, 0x3f6, 0x3f6, 1); (void)resource_list_reserve(rl, bus, dev, type, &rid, 0x3f6, 0x3f6, 1, 0); } if (progif & PCIP_STORAGE_IDE_MODESEC) { pci_add_map(bus, dev, PCIR_BAR(2), rl, force, prefetchmask & (1 << 2)); pci_add_map(bus, dev, PCIR_BAR(3), rl, force, prefetchmask & (1 << 3)); } else { rid = PCIR_BAR(2); resource_list_add(rl, type, rid, 0x170, 0x177, 8); (void)resource_list_reserve(rl, bus, dev, type, &rid, 0x170, 0x177, 8, 0); rid = PCIR_BAR(3); resource_list_add(rl, type, rid, 0x376, 0x376, 1); (void)resource_list_reserve(rl, bus, dev, type, &rid, 0x376, 0x376, 1, 0); } pci_add_map(bus, dev, PCIR_BAR(4), rl, force, prefetchmask & (1 << 4)); pci_add_map(bus, dev, PCIR_BAR(5), rl, force, prefetchmask & (1 << 5)); } static void pci_assign_interrupt(device_t bus, device_t dev, int force_route) { struct pci_devinfo *dinfo = device_get_ivars(dev); pcicfgregs *cfg = &dinfo->cfg; char tunable_name[64]; int irq; /* Has to have an intpin to have an interrupt. */ if (cfg->intpin == 0) return; /* Let the user override the IRQ with a tunable. */ irq = PCI_INVALID_IRQ; snprintf(tunable_name, sizeof(tunable_name), "hw.pci%d.%d.%d.INT%c.irq", cfg->domain, cfg->bus, cfg->slot, cfg->intpin + 'A' - 1); if (TUNABLE_INT_FETCH(tunable_name, &irq) && (irq >= 255 || irq <= 0)) irq = PCI_INVALID_IRQ; /* * If we didn't get an IRQ via the tunable, then we either use the * IRQ value in the intline register or we ask the bus to route an * interrupt for us. If force_route is true, then we only use the * value in the intline register if the bus was unable to assign an * IRQ. */ if (!PCI_INTERRUPT_VALID(irq)) { if (!PCI_INTERRUPT_VALID(cfg->intline) || force_route) irq = PCI_ASSIGN_INTERRUPT(bus, dev); if (!PCI_INTERRUPT_VALID(irq)) irq = cfg->intline; } /* If after all that we don't have an IRQ, just bail. */ if (!PCI_INTERRUPT_VALID(irq)) return; /* Update the config register if it changed. */ if (irq != cfg->intline) { cfg->intline = irq; pci_write_config(dev, PCIR_INTLINE, irq, 1); } /* Add this IRQ as rid 0 interrupt resource. */ resource_list_add(&dinfo->resources, SYS_RES_IRQ, 0, irq, irq, 1); } /* Perform early OHCI takeover from SMM. 
*/ static void ohci_early_takeover(device_t self) { struct resource *res; uint32_t ctl; int rid; int i; rid = PCIR_BAR(0); res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (res == NULL) return; ctl = bus_read_4(res, OHCI_CONTROL); if (ctl & OHCI_IR) { if (bootverbose) printf("ohci early: " "SMM active, request owner change\n"); bus_write_4(res, OHCI_COMMAND_STATUS, OHCI_OCR); for (i = 0; (i < 100) && (ctl & OHCI_IR); i++) { DELAY(1000); ctl = bus_read_4(res, OHCI_CONTROL); } if (ctl & OHCI_IR) { if (bootverbose) printf("ohci early: " "SMM does not respond, resetting\n"); bus_write_4(res, OHCI_CONTROL, OHCI_HCFS_RESET); } /* Disable interrupts */ bus_write_4(res, OHCI_INTERRUPT_DISABLE, OHCI_ALL_INTRS); } bus_release_resource(self, SYS_RES_MEMORY, rid, res); } /* Perform early UHCI takeover from SMM. */ static void uhci_early_takeover(device_t self) { struct resource *res; int rid; /* * Set the PIRQD enable bit and switch off all the others. We don't * want legacy support to interfere with us XXX Does this also mean * that the BIOS won't touch the keyboard anymore if it is connected * to the ports of the root hub? */ pci_write_config(self, PCI_LEGSUP, PCI_LEGSUP_USBPIRQDEN, 2); /* Disable interrupts */ rid = PCI_UHCI_BASE_REG; res = bus_alloc_resource_any(self, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (res != NULL) { bus_write_2(res, UHCI_INTR, 0); bus_release_resource(self, SYS_RES_IOPORT, rid, res); } } /* Perform early EHCI takeover from SMM. */ static void ehci_early_takeover(device_t self) { struct resource *res; uint32_t cparams; uint32_t eec; uint8_t eecp; uint8_t bios_sem; uint8_t offs; int rid; int i; rid = PCIR_BAR(0); res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (res == NULL) return; cparams = bus_read_4(res, EHCI_HCCPARAMS); /* Synchronise with the BIOS if it owns the controller. */ for (eecp = EHCI_HCC_EECP(cparams); eecp != 0; eecp = EHCI_EECP_NEXT(eec)) { eec = pci_read_config(self, eecp, 4); if (EHCI_EECP_ID(eec) != EHCI_EC_LEGSUP) { continue; } bios_sem = pci_read_config(self, eecp + EHCI_LEGSUP_BIOS_SEM, 1); if (bios_sem == 0) { continue; } if (bootverbose) printf("ehci early: " "SMM active, request owner change\n"); pci_write_config(self, eecp + EHCI_LEGSUP_OS_SEM, 1, 1); for (i = 0; (i < 100) && (bios_sem != 0); i++) { DELAY(1000); bios_sem = pci_read_config(self, eecp + EHCI_LEGSUP_BIOS_SEM, 1); } if (bios_sem != 0) { if (bootverbose) printf("ehci early: " "SMM does not respond\n"); } /* Disable interrupts */ offs = EHCI_CAPLENGTH(bus_read_4(res, EHCI_CAPLEN_HCIVERSION)); bus_write_4(res, offs + EHCI_USBINTR, 0); } bus_release_resource(self, SYS_RES_MEMORY, rid, res); } /* Perform early XHCI takeover from SMM. */ static void xhci_early_takeover(device_t self) { struct resource *res; uint32_t cparams; uint32_t eec; uint8_t eecp; uint8_t bios_sem; uint8_t offs; int rid; int i; rid = PCIR_BAR(0); res = bus_alloc_resource_any(self, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (res == NULL) return; cparams = bus_read_4(res, XHCI_HCSPARAMS0); eec = -1; /* Synchronise with the BIOS if it owns the controller. 
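 * The loop below walks the xHCI extended capability list looking for
 * the USB legacy support capability; when the BIOS-owned semaphore is
 * found set, the OS-owned semaphore is raised and we poll for up to
 * roughly five seconds for the firmware to release the controller.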
*/ for (eecp = XHCI_HCS0_XECP(cparams) << 2; eecp != 0 && XHCI_XECP_NEXT(eec); eecp += XHCI_XECP_NEXT(eec) << 2) { eec = bus_read_4(res, eecp); if (XHCI_XECP_ID(eec) != XHCI_ID_USB_LEGACY) continue; bios_sem = bus_read_1(res, eecp + XHCI_XECP_BIOS_SEM); if (bios_sem == 0) continue; if (bootverbose) printf("xhci early: " "SMM active, request owner change\n"); bus_write_1(res, eecp + XHCI_XECP_OS_SEM, 1); /* wait a maximum of 5 second */ for (i = 0; (i < 5000) && (bios_sem != 0); i++) { DELAY(1000); bios_sem = bus_read_1(res, eecp + XHCI_XECP_BIOS_SEM); } if (bios_sem != 0) { if (bootverbose) printf("xhci early: " "SMM does not respond\n"); } /* Disable interrupts */ offs = bus_read_1(res, XHCI_CAPLENGTH); bus_write_4(res, offs + XHCI_USBCMD, 0); bus_read_4(res, offs + XHCI_USBSTS); } bus_release_resource(self, SYS_RES_MEMORY, rid, res); } #if defined(NEW_PCIB) && defined(PCI_RES_BUS) static void pci_reserve_secbus(device_t bus, device_t dev, pcicfgregs *cfg, struct resource_list *rl) { struct resource *res; char *cp; rman_res_t start, end, count; int rid, sec_bus, sec_reg, sub_bus, sub_reg, sup_bus; switch (cfg->hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_BRIDGE: sec_reg = PCIR_SECBUS_1; sub_reg = PCIR_SUBBUS_1; break; case PCIM_HDRTYPE_CARDBUS: sec_reg = PCIR_SECBUS_2; sub_reg = PCIR_SUBBUS_2; break; default: return; } /* * If the existing bus range is valid, attempt to reserve it * from our parent. If this fails for any reason, clear the * secbus and subbus registers. * * XXX: Should we reset sub_bus to sec_bus if it is < sec_bus? * This would at least preserve the existing sec_bus if it is * valid. */ sec_bus = PCI_READ_CONFIG(bus, dev, sec_reg, 1); sub_bus = PCI_READ_CONFIG(bus, dev, sub_reg, 1); /* Quirk handling. */ switch (pci_get_devid(dev)) { case 0x12258086: /* Intel 82454KX/GX (Orion) */ sup_bus = pci_read_config(dev, 0x41, 1); if (sup_bus != 0xff) { sec_bus = sup_bus + 1; sub_bus = sup_bus + 1; PCI_WRITE_CONFIG(bus, dev, sec_reg, sec_bus, 1); PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1); } break; case 0x00dd10de: /* Compaq R3000 BIOS sets wrong subordinate bus number. */ if ((cp = kern_getenv("smbios.planar.maker")) == NULL) break; if (strncmp(cp, "Compal", 6) != 0) { freeenv(cp); break; } freeenv(cp); if ((cp = kern_getenv("smbios.planar.product")) == NULL) break; if (strncmp(cp, "08A0", 4) != 0) { freeenv(cp); break; } freeenv(cp); if (sub_bus < 0xa) { sub_bus = 0xa; PCI_WRITE_CONFIG(bus, dev, sub_reg, sub_bus, 1); } break; } if (bootverbose) printf("\tsecbus=%d, subbus=%d\n", sec_bus, sub_bus); if (sec_bus > 0 && sub_bus >= sec_bus) { start = sec_bus; end = sub_bus; count = end - start + 1; resource_list_add(rl, PCI_RES_BUS, 0, 0, ~0, count); /* * If requested, clear secondary bus registers in * bridge devices to force a complete renumbering * rather than reserving the existing range. However, * preserve the existing size. 
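 * The bus resource entry added just above is what preserves that
 * size: only the secbus/subbus registers themselves are zeroed in
 * the clear path.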
*/ if (pci_clear_buses) goto clear; rid = 0; res = resource_list_reserve(rl, bus, dev, PCI_RES_BUS, &rid, start, end, count, 0); if (res != NULL) return; if (bootverbose) device_printf(bus, "pci%d:%d:%d:%d secbus failed to allocate\n", pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); } clear: PCI_WRITE_CONFIG(bus, dev, sec_reg, 0, 1); PCI_WRITE_CONFIG(bus, dev, sub_reg, 0, 1); } static struct resource * pci_alloc_secbus(device_t dev, device_t child, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct pci_devinfo *dinfo; pcicfgregs *cfg; struct resource_list *rl; struct resource *res; int sec_reg, sub_reg; dinfo = device_get_ivars(child); cfg = &dinfo->cfg; rl = &dinfo->resources; switch (cfg->hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_BRIDGE: sec_reg = PCIR_SECBUS_1; sub_reg = PCIR_SUBBUS_1; break; case PCIM_HDRTYPE_CARDBUS: sec_reg = PCIR_SECBUS_2; sub_reg = PCIR_SUBBUS_2; break; default: return (NULL); } if (*rid != 0) return (NULL); if (resource_list_find(rl, PCI_RES_BUS, *rid) == NULL) resource_list_add(rl, PCI_RES_BUS, *rid, start, end, count); if (!resource_list_reserved(rl, PCI_RES_BUS, *rid)) { res = resource_list_reserve(rl, dev, child, PCI_RES_BUS, rid, start, end, count, flags & ~RF_ACTIVE); if (res == NULL) { resource_list_delete(rl, PCI_RES_BUS, *rid); device_printf(child, "allocating %ju bus%s failed\n", count, count == 1 ? "" : "es"); return (NULL); } if (bootverbose) device_printf(child, "Lazy allocation of %ju bus%s at %ju\n", count, count == 1 ? "" : "es", rman_get_start(res)); PCI_WRITE_CONFIG(dev, child, sec_reg, rman_get_start(res), 1); PCI_WRITE_CONFIG(dev, child, sub_reg, rman_get_end(res), 1); } return (resource_list_alloc(rl, dev, child, PCI_RES_BUS, rid, start, end, count, flags)); } #endif static int pci_ea_bei_to_rid(device_t dev, int bei) { #ifdef PCI_IOV struct pci_devinfo *dinfo; int iov_pos; struct pcicfg_iov *iov; dinfo = device_get_ivars(dev); iov = dinfo->cfg.iov; if (iov != NULL) iov_pos = iov->iov_pos; else iov_pos = 0; #endif /* Check if matches BAR */ if ((bei >= PCIM_EA_BEI_BAR_0) && (bei <= PCIM_EA_BEI_BAR_5)) return (PCIR_BAR(bei)); /* Check ROM */ if (bei == PCIM_EA_BEI_ROM) return (PCIR_BIOS); #ifdef PCI_IOV /* Check if matches VF_BAR */ if ((iov != NULL) && (bei >= PCIM_EA_BEI_VF_BAR_0) && (bei <= PCIM_EA_BEI_VF_BAR_5)) return (PCIR_SRIOV_BAR(bei - PCIM_EA_BEI_VF_BAR_0) + iov_pos); #endif return (-1); } int pci_ea_is_enabled(device_t dev, int rid) { struct pci_ea_entry *ea; struct pci_devinfo *dinfo; dinfo = device_get_ivars(dev); STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) { if (pci_ea_bei_to_rid(dev, ea->eae_bei) == rid) return ((ea->eae_flags & PCIM_EA_ENABLE) > 0); } return (0); } void pci_add_resources_ea(device_t bus, device_t dev, int alloc_iov) { struct pci_ea_entry *ea; struct pci_devinfo *dinfo; pci_addr_t start, end, count; struct resource_list *rl; int type, flags, rid; struct resource *res; uint32_t tmp; #ifdef PCI_IOV struct pcicfg_iov *iov; #endif dinfo = device_get_ivars(dev); rl = &dinfo->resources; flags = 0; #ifdef PCI_IOV iov = dinfo->cfg.iov; #endif if (dinfo->cfg.ea.ea_location == 0) return; STAILQ_FOREACH(ea, &dinfo->cfg.ea.ea_entries, eae_link) { /* * TODO: Ignore EA-BAR if is not enabled. * Currently the EA implementation supports * only situation, where EA structure contains * predefined entries. In case they are not enabled * leave them unallocated and proceed with * a legacy-BAR mechanism. 
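 * That is, an entry whose PCIM_EA_ENABLE flag is clear is skipped
 * here and the corresponding BAR is handled through the legacy
 * pci_add_map() path later.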
*/ if ((ea->eae_flags & PCIM_EA_ENABLE) == 0) continue; switch ((ea->eae_flags & PCIM_EA_PP) >> PCIM_EA_PP_OFFSET) { case PCIM_EA_P_MEM_PREFETCH: case PCIM_EA_P_VF_MEM_PREFETCH: flags = RF_PREFETCHABLE; /* FALLTHROUGH */ case PCIM_EA_P_VF_MEM: case PCIM_EA_P_MEM: type = SYS_RES_MEMORY; break; case PCIM_EA_P_IO: type = SYS_RES_IOPORT; break; default: continue; } if (alloc_iov != 0) { #ifdef PCI_IOV /* Allocating IOV, confirm BEI matches */ if ((ea->eae_bei < PCIM_EA_BEI_VF_BAR_0) || (ea->eae_bei > PCIM_EA_BEI_VF_BAR_5)) continue; #else continue; #endif } else { /* Allocating BAR, confirm BEI matches */ if (((ea->eae_bei < PCIM_EA_BEI_BAR_0) || (ea->eae_bei > PCIM_EA_BEI_BAR_5)) && (ea->eae_bei != PCIM_EA_BEI_ROM)) continue; } rid = pci_ea_bei_to_rid(dev, ea->eae_bei); if (rid < 0) continue; /* Skip resources already allocated by EA */ if ((resource_list_find(rl, SYS_RES_MEMORY, rid) != NULL) || (resource_list_find(rl, SYS_RES_IOPORT, rid) != NULL)) continue; start = ea->eae_base; count = ea->eae_max_offset + 1; #ifdef PCI_IOV if (iov != NULL) count = count * iov->iov_num_vfs; #endif end = start + count - 1; if (count == 0) continue; resource_list_add(rl, type, rid, start, end, count); res = resource_list_reserve(rl, bus, dev, type, &rid, start, end, count, flags); if (res == NULL) { resource_list_delete(rl, type, rid); /* * Failed to allocate using EA, disable entry. * Another attempt to allocation will be performed * further, but this time using legacy BAR registers */ tmp = pci_read_config(dev, ea->eae_cfg_offset, 4); tmp &= ~PCIM_EA_ENABLE; pci_write_config(dev, ea->eae_cfg_offset, tmp, 4); /* * Disabling entry might fail in case it is hardwired. * Read flags again to match current status. */ ea->eae_flags = pci_read_config(dev, ea->eae_cfg_offset, 4); continue; } /* As per specification, fill BAR with zeros */ pci_write_config(dev, rid, 0, 4); } } void pci_add_resources(device_t bus, device_t dev, int force, uint32_t prefetchmask) { struct pci_devinfo *dinfo; pcicfgregs *cfg; struct resource_list *rl; const struct pci_quirk *q; uint32_t devid; int i; dinfo = device_get_ivars(dev); cfg = &dinfo->cfg; rl = &dinfo->resources; devid = (cfg->device << 16) | cfg->vendor; /* Allocate resources using Enhanced Allocation */ pci_add_resources_ea(bus, dev, 0); /* ATA devices needs special map treatment */ if ((pci_get_class(dev) == PCIC_STORAGE) && (pci_get_subclass(dev) == PCIS_STORAGE_IDE) && ((pci_get_progif(dev) & PCIP_STORAGE_IDE_MASTERDEV) || (!pci_read_config(dev, PCIR_BAR(0), 4) && !pci_read_config(dev, PCIR_BAR(2), 4))) ) pci_ata_maps(bus, dev, rl, force, prefetchmask); else for (i = 0; i < cfg->nummaps;) { /* Skip resources already managed by EA */ if ((resource_list_find(rl, SYS_RES_MEMORY, PCIR_BAR(i)) != NULL) || (resource_list_find(rl, SYS_RES_IOPORT, PCIR_BAR(i)) != NULL) || pci_ea_is_enabled(dev, PCIR_BAR(i))) { i++; continue; } /* * Skip quirked resources. */ for (q = &pci_quirks[0]; q->devid != 0; q++) if (q->devid == devid && q->type == PCI_QUIRK_UNMAP_REG && q->arg1 == PCIR_BAR(i)) break; if (q->devid != 0) { i++; continue; } i += pci_add_map(bus, dev, PCIR_BAR(i), rl, force, prefetchmask & (1 << i)); } /* * Add additional, quirked resources. */ for (q = &pci_quirks[0]; q->devid != 0; q++) if (q->devid == devid && q->type == PCI_QUIRK_MAP_REG) pci_add_map(bus, dev, q->arg1, rl, force, 0); if (cfg->intpin > 0 && PCI_INTERRUPT_VALID(cfg->intline)) { #ifdef __PCI_REROUTE_INTERRUPT /* * Try to re-route interrupts. 
Sometimes the BIOS or * firmware may leave bogus values in these registers. * If the re-route fails, then just stick with what we * have. */ pci_assign_interrupt(bus, dev, 1); #else pci_assign_interrupt(bus, dev, 0); #endif } if (pci_usb_takeover && pci_get_class(dev) == PCIC_SERIALBUS && pci_get_subclass(dev) == PCIS_SERIALBUS_USB) { if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_XHCI) xhci_early_takeover(dev); else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_EHCI) ehci_early_takeover(dev); else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_OHCI) ohci_early_takeover(dev); else if (pci_get_progif(dev) == PCIP_SERIALBUS_USB_UHCI) uhci_early_takeover(dev); } #if defined(NEW_PCIB) && defined(PCI_RES_BUS) /* * Reserve resources for secondary bus ranges behind bridge * devices. */ pci_reserve_secbus(bus, dev, cfg, rl); #endif } static struct pci_devinfo * pci_identify_function(device_t pcib, device_t dev, int domain, int busno, int slot, int func) { struct pci_devinfo *dinfo; dinfo = pci_read_device(pcib, dev, domain, busno, slot, func); if (dinfo != NULL) pci_add_child(dev, dinfo); return (dinfo); } void pci_add_children(device_t dev, int domain, int busno) { #define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w) device_t pcib = device_get_parent(dev); struct pci_devinfo *dinfo; int maxslots; int s, f, pcifunchigh; uint8_t hdrtype; int first_func; /* * Try to detect a device at slot 0, function 0. If it exists, try to * enable ARI. We must enable ARI before detecting the rest of the * functions on this bus as ARI changes the set of slots and functions * that are legal on this bus. */ dinfo = pci_identify_function(pcib, dev, domain, busno, 0, 0); if (dinfo != NULL && pci_enable_ari) PCIB_TRY_ENABLE_ARI(pcib, dinfo->cfg.dev); /* * Start looking for new devices on slot 0 at function 1 because we * just identified the device at slot 0, function 0. */ first_func = 1; maxslots = PCIB_MAXSLOTS(pcib); for (s = 0; s <= maxslots; s++, first_func = 0) { pcifunchigh = 0; f = 0; DELAY(1); /* If function 0 is not present, skip to the next slot. */ if (REG(PCIR_VENDOR, 2) == PCIV_INVALID) continue; hdrtype = REG(PCIR_HDRTYPE, 1); if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE) continue; if (hdrtype & PCIM_MFDEV) pcifunchigh = PCIB_MAXFUNCS(pcib); for (f = first_func; f <= pcifunchigh; f++) pci_identify_function(pcib, dev, domain, busno, s, f); } #undef REG } int pci_rescan_method(device_t dev) { #define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w) device_t pcib = device_get_parent(dev); device_t child, *devlist, *unchanged; int devcount, error, i, j, maxslots, oldcount; int busno, domain, s, f, pcifunchigh; uint8_t hdrtype; /* No need to check for ARI on a rescan. */ error = device_get_children(dev, &devlist, &devcount); if (error) return (error); if (devcount != 0) { unchanged = malloc(devcount * sizeof(device_t), M_TEMP, M_NOWAIT | M_ZERO); if (unchanged == NULL) { free(devlist, M_TEMP); return (ENOMEM); } } else unchanged = NULL; domain = pcib_get_domain(dev); busno = pcib_get_bus(dev); maxslots = PCIB_MAXSLOTS(pcib); for (s = 0; s <= maxslots; s++) { /* If function 0 is not present, skip to the next slot. */ f = 0; if (REG(PCIR_VENDOR, 2) == PCIV_INVALID) continue; pcifunchigh = 0; hdrtype = REG(PCIR_HDRTYPE, 1); if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE) continue; if (hdrtype & PCIM_MFDEV) pcifunchigh = PCIB_MAXFUNCS(pcib); for (f = 0; f <= pcifunchigh; f++) { if (REG(PCIR_VENDOR, 2) == PCIV_INVALID) continue; /* * Found a valid function. 
Check if a * device_t for this device already exists. */ for (i = 0; i < devcount; i++) { child = devlist[i]; if (child == NULL) continue; if (pci_get_slot(child) == s && pci_get_function(child) == f) { unchanged[i] = child; goto next_func; } } pci_identify_function(pcib, dev, domain, busno, s, f); next_func:; } } /* Remove devices that are no longer present. */ for (i = 0; i < devcount; i++) { if (unchanged[i] != NULL) continue; device_delete_child(dev, devlist[i]); } free(devlist, M_TEMP); oldcount = devcount; /* Try to attach the devices just added. */ error = device_get_children(dev, &devlist, &devcount); if (error) { free(unchanged, M_TEMP); return (error); } for (i = 0; i < devcount; i++) { for (j = 0; j < oldcount; j++) { if (devlist[i] == unchanged[j]) goto next_device; } device_probe_and_attach(devlist[i]); next_device:; } free(unchanged, M_TEMP); free(devlist, M_TEMP); return (0); #undef REG } #ifdef PCI_IOV device_t pci_add_iov_child(device_t bus, device_t pf, uint16_t rid, uint16_t vid, uint16_t did) { struct pci_devinfo *vf_dinfo; device_t pcib; int busno, slot, func; pcib = device_get_parent(bus); PCIB_DECODE_RID(pcib, rid, &busno, &slot, &func); vf_dinfo = pci_fill_devinfo(pcib, bus, pci_get_domain(pcib), busno, slot, func, vid, did); vf_dinfo->cfg.flags |= PCICFG_VF; pci_add_child(bus, vf_dinfo); return (vf_dinfo->cfg.dev); } device_t pci_create_iov_child_method(device_t bus, device_t pf, uint16_t rid, uint16_t vid, uint16_t did) { return (pci_add_iov_child(bus, pf, rid, vid, did)); } #endif /* * For PCIe device set Max_Payload_Size to match PCIe root's. */ static void pcie_setup_mps(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); device_t root; uint16_t rmps, mmps, mps; if (dinfo->cfg.pcie.pcie_location == 0) return; root = pci_find_pcie_root_port(dev); if (root == NULL) return; /* Check whether the MPS is already configured. */ rmps = pcie_read_config(root, PCIER_DEVICE_CTL, 2) & PCIEM_CTL_MAX_PAYLOAD; mps = pcie_read_config(dev, PCIER_DEVICE_CTL, 2) & PCIEM_CTL_MAX_PAYLOAD; if (mps == rmps) return; /* Check whether the device is capable of the root's MPS. */ mmps = (pcie_read_config(dev, PCIER_DEVICE_CAP, 2) & PCIEM_CAP_MAX_PAYLOAD) << 5; if (rmps > mmps) { /* * The device is unable to handle root's MPS. Limit root. * XXX: We should traverse through all the tree, applying * it to all the devices. 
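 */

/*
 * Illustrative sketch, not part of this file: PCIe encodes a payload
 * size as a 3-bit exponent meaning 128 << n bytes.  DEVICE_CAP carries
 * the supported value in its low bits while DEVICE_CTL holds the
 * programmed value at bits 7:5, which is why the code above shifts the
 * capability field left by 5 before comparing.  Hypothetical helper:
 */
static int
pcie_mps_bytes(uint16_t device_ctl)
{

        /* Extract the 3-bit MPS exponent from DEVICE_CTL and scale. */
        return (128 << ((device_ctl & PCIEM_CTL_MAX_PAYLOAD) >> 5));
}

/*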
*/ pcie_adjust_config(root, PCIER_DEVICE_CTL, PCIEM_CTL_MAX_PAYLOAD, mmps, 2); } else { pcie_adjust_config(dev, PCIER_DEVICE_CTL, PCIEM_CTL_MAX_PAYLOAD, rmps, 2); } } static void pci_add_child_clear_aer(device_t dev, struct pci_devinfo *dinfo) { int aer; uint32_t r; uint16_t r2; if (dinfo->cfg.pcie.pcie_location != 0 && dinfo->cfg.pcie.pcie_type == PCIEM_TYPE_ROOT_PORT) { r2 = pci_read_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_ROOT_CTL, 2); r2 &= ~(PCIEM_ROOT_CTL_SERR_CORR | PCIEM_ROOT_CTL_SERR_NONFATAL | PCIEM_ROOT_CTL_SERR_FATAL); pci_write_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_ROOT_CTL, r2, 2); } if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0) { r = pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4); pci_write_config(dev, aer + PCIR_AER_UC_STATUS, r, 4); if (r != 0 && bootverbose) { pci_printf(&dinfo->cfg, "clearing AER UC 0x%08x -> 0x%08x\n", r, pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4)); } r = pci_read_config(dev, aer + PCIR_AER_UC_MASK, 4); r &= ~(PCIM_AER_UC_TRAINING_ERROR | PCIM_AER_UC_DL_PROTOCOL_ERROR | PCIM_AER_UC_SURPRISE_LINK_DOWN | PCIM_AER_UC_POISONED_TLP | PCIM_AER_UC_FC_PROTOCOL_ERROR | PCIM_AER_UC_COMPLETION_TIMEOUT | PCIM_AER_UC_COMPLETER_ABORT | PCIM_AER_UC_UNEXPECTED_COMPLETION | PCIM_AER_UC_RECEIVER_OVERFLOW | PCIM_AER_UC_MALFORMED_TLP | PCIM_AER_UC_ECRC_ERROR | PCIM_AER_UC_UNSUPPORTED_REQUEST | PCIM_AER_UC_ACS_VIOLATION | PCIM_AER_UC_INTERNAL_ERROR | PCIM_AER_UC_MC_BLOCKED_TLP | PCIM_AER_UC_ATOMIC_EGRESS_BLK | PCIM_AER_UC_TLP_PREFIX_BLOCKED); pci_write_config(dev, aer + PCIR_AER_UC_MASK, r, 4); r = pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4); pci_write_config(dev, aer + PCIR_AER_COR_STATUS, r, 4); if (r != 0 && bootverbose) { pci_printf(&dinfo->cfg, "clearing AER COR 0x%08x -> 0x%08x\n", r, pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4)); } r = pci_read_config(dev, aer + PCIR_AER_COR_MASK, 4); r &= ~(PCIM_AER_COR_RECEIVER_ERROR | PCIM_AER_COR_BAD_TLP | PCIM_AER_COR_BAD_DLLP | PCIM_AER_COR_REPLAY_ROLLOVER | PCIM_AER_COR_REPLAY_TIMEOUT | PCIM_AER_COR_ADVISORY_NF_ERROR | PCIM_AER_COR_INTERNAL_ERROR | PCIM_AER_COR_HEADER_LOG_OVFLOW); pci_write_config(dev, aer + PCIR_AER_COR_MASK, r, 4); r = pci_read_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_CTL, 2); r |= PCIEM_CTL_COR_ENABLE | PCIEM_CTL_NFER_ENABLE | PCIEM_CTL_FER_ENABLE | PCIEM_CTL_URR_ENABLE; pci_write_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_CTL, r, 2); } } void pci_add_child(device_t bus, struct pci_devinfo *dinfo) { device_t dev; dinfo->cfg.dev = dev = device_add_child(bus, NULL, -1); device_set_ivars(dev, dinfo); resource_list_init(&dinfo->resources); pci_cfg_save(dev, dinfo, 0); pci_cfg_restore(dev, dinfo); pci_print_verbose(dinfo); pci_add_resources(bus, dev, 0, 0); pcie_setup_mps(dev); pci_child_added(dinfo->cfg.dev); if (pci_clear_aer_on_attach) pci_add_child_clear_aer(dev, dinfo); EVENTHANDLER_INVOKE(pci_add_device, dinfo->cfg.dev); } void pci_child_added_method(device_t dev, device_t child) { } static int pci_probe(device_t dev) { device_set_desc(dev, "PCI bus"); /* Allow other subclasses to override this driver. 
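 */

/*
 * Illustrative sketch, not part of this file: BUS_PROBE_GENERIC is a
 * deliberately low probe priority, so a more specific PCI bus
 * attachment wins simply by returning a better value from its own
 * probe method.  The driver name below is hypothetical.
 */
static int
mybus_pci_probe(device_t dev)
{

        device_set_desc(dev, "PCI bus (platform-specific attachment)");
        /* Outranks BUS_PROBE_GENERIC, so this attachment is preferred. */
        return (BUS_PROBE_DEFAULT);
}

/*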
*/ return (BUS_PROBE_GENERIC); } int pci_attach_common(device_t dev) { struct pci_softc *sc; int busno, domain; #ifdef PCI_RES_BUS int rid; #endif sc = device_get_softc(dev); domain = pcib_get_domain(dev); busno = pcib_get_bus(dev); #ifdef PCI_RES_BUS rid = 0; sc->sc_bus = bus_alloc_resource(dev, PCI_RES_BUS, &rid, busno, busno, 1, 0); if (sc->sc_bus == NULL) { device_printf(dev, "failed to allocate bus number\n"); return (ENXIO); } #endif if (bootverbose) device_printf(dev, "domain=%d, physical bus=%d\n", domain, busno); sc->sc_dma_tag = bus_get_dma_tag(dev); return (0); } int pci_attach(device_t dev) { int busno, domain, error; error = pci_attach_common(dev); if (error) return (error); /* * Since there can be multiple independently numbered PCI * buses on systems with multiple PCI domains, we can't use * the unit number to decide which bus we are probing. We ask * the parent pcib what our domain and bus numbers are. */ domain = pcib_get_domain(dev); busno = pcib_get_bus(dev); pci_add_children(dev, domain, busno); return (bus_generic_attach(dev)); } int pci_detach(device_t dev) { #ifdef PCI_RES_BUS struct pci_softc *sc; #endif int error; error = bus_generic_detach(dev); if (error) return (error); #ifdef PCI_RES_BUS sc = device_get_softc(dev); error = bus_release_resource(dev, PCI_RES_BUS, 0, sc->sc_bus); if (error) return (error); #endif return (device_delete_children(dev)); } static void pci_hint_device_unit(device_t dev, device_t child, const char *name, int *unitp) { int line, unit; const char *at; char me1[24], me2[32]; uint8_t b, s, f; uint32_t d; device_location_cache_t *cache; d = pci_get_domain(child); b = pci_get_bus(child); s = pci_get_slot(child); f = pci_get_function(child); snprintf(me1, sizeof(me1), "pci%u:%u:%u", b, s, f); snprintf(me2, sizeof(me2), "pci%u:%u:%u:%u", d, b, s, f); line = 0; cache = dev_wired_cache_init(); while (resource_find_dev(&line, name, &unit, "at", NULL) == 0) { resource_string_value(name, unit, "at", &at); if (strcmp(at, me1) == 0 || strcmp(at, me2) == 0) { *unitp = unit; break; } if (dev_wired_cache_match(cache, child, at)) { *unitp = unit; break; } } dev_wired_cache_fini(cache); } static void pci_set_power_child(device_t dev, device_t child, int state) { device_t pcib; int dstate; /* * Set the device to the given state. If the firmware suggests * a different power state, use it instead. If power management * is not present, the firmware is responsible for managing * device power. Skip children who aren't attached since they * are handled separately. */ pcib = device_get_parent(dev); dstate = state; if (device_is_attached(child) && PCIB_POWER_FOR_SLEEP(pcib, child, &dstate) == 0) pci_set_powerstate(child, dstate); } int pci_suspend_child(device_t dev, device_t child) { struct pci_devinfo *dinfo; struct resource_list_entry *rle; int error; dinfo = device_get_ivars(child); /* * Save the PCI configuration space for the child and set the * device in the appropriate power state for this sleep state. */ pci_cfg_save(child, dinfo, 0); /* Suspend devices before potentially powering them down. */ error = bus_generic_suspend_child(dev, child); if (error) return (error); if (pci_do_power_suspend) { /* * Make sure this device's interrupt handler is not invoked * in the case the device uses a shared interrupt that can * be raised by some other device. * This is applicable only to regular (legacy) PCI interrupts * as MSI/MSI-X interrupts are never shared. 
*/ rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0); if (rle != NULL && rle->res != NULL) (void)bus_suspend_intr(child, rle->res); pci_set_power_child(dev, child, PCI_POWERSTATE_D3); } return (0); } int pci_resume_child(device_t dev, device_t child) { struct pci_devinfo *dinfo; struct resource_list_entry *rle; if (pci_do_power_resume) pci_set_power_child(dev, child, PCI_POWERSTATE_D0); dinfo = device_get_ivars(child); pci_cfg_restore(child, dinfo); if (!device_is_attached(child)) pci_cfg_save(child, dinfo, 1); bus_generic_resume_child(dev, child); /* * Allow interrupts only after fully resuming the driver and hardware. */ if (pci_do_power_suspend) { /* See pci_suspend_child for details. */ rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, 0); if (rle != NULL && rle->res != NULL) (void)bus_resume_intr(child, rle->res); } return (0); } int pci_resume(device_t dev) { device_t child, *devlist; int error, i, numdevs; if ((error = device_get_children(dev, &devlist, &numdevs)) != 0) return (error); /* * Resume critical devices first, then everything else later. */ for (i = 0; i < numdevs; i++) { child = devlist[i]; switch (pci_get_class(child)) { case PCIC_DISPLAY: case PCIC_MEMORY: case PCIC_BRIDGE: case PCIC_BASEPERIPH: BUS_RESUME_CHILD(dev, child); break; } } for (i = 0; i < numdevs; i++) { child = devlist[i]; switch (pci_get_class(child)) { case PCIC_DISPLAY: case PCIC_MEMORY: case PCIC_BRIDGE: case PCIC_BASEPERIPH: break; default: BUS_RESUME_CHILD(dev, child); } } free(devlist, M_TEMP); return (0); } static void pci_load_vendor_data(void) { caddr_t data; void *ptr; size_t sz; data = preload_search_by_type("pci_vendor_data"); if (data != NULL) { ptr = preload_fetch_addr(data); sz = preload_fetch_size(data); if (ptr != NULL && sz != 0) { pci_vendordata = ptr; pci_vendordata_size = sz; /* terminate the database */ pci_vendordata[pci_vendordata_size] = '\n'; } } } void pci_driver_added(device_t dev, driver_t *driver) { int numdevs; device_t *devlist; device_t child; struct pci_devinfo *dinfo; int i; if (bootverbose) device_printf(dev, "driver added\n"); DEVICE_IDENTIFY(driver, dev); if (device_get_children(dev, &devlist, &numdevs) != 0) return; for (i = 0; i < numdevs; i++) { child = devlist[i]; if (device_get_state(child) != DS_NOTPRESENT) continue; dinfo = device_get_ivars(child); pci_print_verbose(dinfo); if (bootverbose) pci_printf(&dinfo->cfg, "reprobing on driver added\n"); pci_cfg_restore(child, dinfo); if (device_probe_and_attach(child) != 0) pci_child_detached(dev, child); } free(devlist, M_TEMP); } int pci_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { struct pci_devinfo *dinfo; struct msix_table_entry *mte; struct msix_vector *mv; uint64_t addr; uint32_t data; void *cookie; int error, rid; error = bus_generic_setup_intr(dev, child, irq, flags, filter, intr, arg, &cookie); if (error) return (error); /* If this is not a direct child, just bail out. */ if (device_get_parent(child) != dev) { *cookiep = cookie; return(0); } rid = rman_get_rid(irq); if (rid == 0) { /* Make sure that INTx is enabled */ pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS); } else { /* * Check to see if the interrupt is MSI or MSI-X. * Ask our parent to map the MSI and give * us the address and data register values. * If we fail for some reason, teardown the * interrupt handler. 
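 */

/*
 * Illustrative sketch, not part of this file: the driver-side sequence
 * that ends up here typically allocates MSI vectors, allocates the IRQ
 * resource (MSI/MSI-X rids start at 1), and then calls bus_setup_intr().
 * The softc layout, handler, and function name below are hypothetical.
 */
struct mydev_softc {
        struct resource *irq_res;
        void            *irq_cookie;
};

static void
mydev_intr(void *arg)
{
        /* Device-specific interrupt handling would go here. */
}

static int
mydev_setup_msi(device_t dev, struct mydev_softc *sc)
{
        int count, error, rid;

        count = 1;
        if (pci_msi_count(dev) < 1 || pci_alloc_msi(dev, &count) != 0)
                return (ENXIO);
        rid = 1;                        /* rid 0 is the legacy INTx line */
        sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
            RF_ACTIVE);
        if (sc->irq_res == NULL) {
                pci_release_msi(dev);
                return (ENXIO);
        }
        error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
            NULL, mydev_intr, sc, &sc->irq_cookie);
        if (error != 0) {
                bus_release_resource(dev, SYS_RES_IRQ, rid, sc->irq_res);
                pci_release_msi(dev);
        }
        return (error);
}

/*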
*/ dinfo = device_get_ivars(child); if (dinfo->cfg.msi.msi_alloc > 0) { if (dinfo->cfg.msi.msi_addr == 0) { KASSERT(dinfo->cfg.msi.msi_handlers == 0, ("MSI has handlers, but vectors not mapped")); error = PCIB_MAP_MSI(device_get_parent(dev), child, rman_get_start(irq), &addr, &data); if (error) goto bad; dinfo->cfg.msi.msi_addr = addr; dinfo->cfg.msi.msi_data = data; } if (dinfo->cfg.msi.msi_handlers == 0) pci_enable_msi(child, dinfo->cfg.msi.msi_addr, dinfo->cfg.msi.msi_data); dinfo->cfg.msi.msi_handlers++; } else { KASSERT(dinfo->cfg.msix.msix_alloc > 0, ("No MSI or MSI-X interrupts allocated")); KASSERT(rid <= dinfo->cfg.msix.msix_table_len, ("MSI-X index too high")); mte = &dinfo->cfg.msix.msix_table[rid - 1]; KASSERT(mte->mte_vector != 0, ("no message vector")); mv = &dinfo->cfg.msix.msix_vectors[mte->mte_vector - 1]; KASSERT(mv->mv_irq == rman_get_start(irq), ("IRQ mismatch")); if (mv->mv_address == 0) { KASSERT(mte->mte_handlers == 0, ("MSI-X table entry has handlers, but vector not mapped")); error = PCIB_MAP_MSI(device_get_parent(dev), child, rman_get_start(irq), &addr, &data); if (error) goto bad; mv->mv_address = addr; mv->mv_data = data; } /* * The MSIX table entry must be made valid by * incrementing the mte_handlers before * calling pci_enable_msix() and * pci_resume_msix(). Else the MSIX rewrite * table quirk will not work as expected. */ mte->mte_handlers++; if (mte->mte_handlers == 1) { pci_enable_msix(child, rid - 1, mv->mv_address, mv->mv_data); pci_unmask_msix(child, rid - 1); } } /* * Make sure that INTx is disabled if we are using MSI/MSI-X, * unless the device is affected by PCI_QUIRK_MSI_INTX_BUG, * in which case we "enable" INTx so MSI/MSI-X actually works. */ if (!pci_has_quirk(pci_get_devid(child), PCI_QUIRK_MSI_INTX_BUG)) pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS); else pci_clear_command_bit(dev, child, PCIM_CMD_INTxDIS); bad: if (error) { (void)bus_generic_teardown_intr(dev, child, irq, cookie); return (error); } } *cookiep = cookie; return (0); } int pci_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { struct msix_table_entry *mte; struct resource_list_entry *rle; struct pci_devinfo *dinfo; int error, rid; if (irq == NULL || !(rman_get_flags(irq) & RF_ACTIVE)) return (EINVAL); /* If this isn't a direct child, just bail out */ if (device_get_parent(child) != dev) return(bus_generic_teardown_intr(dev, child, irq, cookie)); rid = rman_get_rid(irq); if (rid == 0) { /* Mask INTx */ pci_set_command_bit(dev, child, PCIM_CMD_INTxDIS); } else { /* * Check to see if the interrupt is MSI or MSI-X. If so, * decrement the appropriate handlers count and mask the * MSI-X message, or disable MSI messages if the count * drops to 0. 
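 */

/*
 * Illustrative sketch, not part of this file: the matching driver-side
 * release order that drives the handler counting above is to tear down
 * the handler, free the IRQ resource, and only then release the MSI
 * vectors.  This reuses the hypothetical softc from the sketch above.
 */
static void
mydev_teardown_msi(device_t dev, struct mydev_softc *sc)
{

        if (sc->irq_cookie != NULL)
                bus_teardown_intr(dev, sc->irq_res, sc->irq_cookie);
        if (sc->irq_res != NULL)
                bus_release_resource(dev, SYS_RES_IRQ,
                    rman_get_rid(sc->irq_res), sc->irq_res);
        pci_release_msi(dev);
}

/*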
*/ dinfo = device_get_ivars(child); rle = resource_list_find(&dinfo->resources, SYS_RES_IRQ, rid); if (rle->res != irq) return (EINVAL); if (dinfo->cfg.msi.msi_alloc > 0) { KASSERT(rid <= dinfo->cfg.msi.msi_alloc, ("MSI-X index too high")); if (dinfo->cfg.msi.msi_handlers == 0) return (EINVAL); dinfo->cfg.msi.msi_handlers--; if (dinfo->cfg.msi.msi_handlers == 0) pci_disable_msi(child); } else { KASSERT(dinfo->cfg.msix.msix_alloc > 0, ("No MSI or MSI-X interrupts allocated")); KASSERT(rid <= dinfo->cfg.msix.msix_table_len, ("MSI-X index too high")); mte = &dinfo->cfg.msix.msix_table[rid - 1]; if (mte->mte_handlers == 0) return (EINVAL); mte->mte_handlers--; if (mte->mte_handlers == 0) pci_mask_msix(child, rid - 1); } } error = bus_generic_teardown_intr(dev, child, irq, cookie); if (rid > 0) KASSERT(error == 0, ("%s: generic teardown failed for MSI/MSI-X", __func__)); return (error); } int pci_print_child(device_t dev, device_t child) { struct pci_devinfo *dinfo; struct resource_list *rl; int retval = 0; dinfo = device_get_ivars(child); rl = &dinfo->resources; retval += bus_print_child_header(dev, child); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); if (device_get_flags(dev)) retval += printf(" flags %#x", device_get_flags(dev)); retval += printf(" at device %d.%d", pci_get_slot(child), pci_get_function(child)); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } static const struct { int class; int subclass; int report; /* 0 = bootverbose, 1 = always */ const char *desc; } pci_nomatch_tab[] = { {PCIC_OLD, -1, 1, "old"}, {PCIC_OLD, PCIS_OLD_NONVGA, 1, "non-VGA display device"}, {PCIC_OLD, PCIS_OLD_VGA, 1, "VGA-compatible display device"}, {PCIC_STORAGE, -1, 1, "mass storage"}, {PCIC_STORAGE, PCIS_STORAGE_SCSI, 1, "SCSI"}, {PCIC_STORAGE, PCIS_STORAGE_IDE, 1, "ATA"}, {PCIC_STORAGE, PCIS_STORAGE_FLOPPY, 1, "floppy disk"}, {PCIC_STORAGE, PCIS_STORAGE_IPI, 1, "IPI"}, {PCIC_STORAGE, PCIS_STORAGE_RAID, 1, "RAID"}, {PCIC_STORAGE, PCIS_STORAGE_ATA_ADMA, 1, "ATA (ADMA)"}, {PCIC_STORAGE, PCIS_STORAGE_SATA, 1, "SATA"}, {PCIC_STORAGE, PCIS_STORAGE_SAS, 1, "SAS"}, {PCIC_STORAGE, PCIS_STORAGE_NVM, 1, "NVM"}, {PCIC_NETWORK, -1, 1, "network"}, {PCIC_NETWORK, PCIS_NETWORK_ETHERNET, 1, "ethernet"}, {PCIC_NETWORK, PCIS_NETWORK_TOKENRING, 1, "token ring"}, {PCIC_NETWORK, PCIS_NETWORK_FDDI, 1, "fddi"}, {PCIC_NETWORK, PCIS_NETWORK_ATM, 1, "ATM"}, {PCIC_NETWORK, PCIS_NETWORK_ISDN, 1, "ISDN"}, {PCIC_DISPLAY, -1, 1, "display"}, {PCIC_DISPLAY, PCIS_DISPLAY_VGA, 1, "VGA"}, {PCIC_DISPLAY, PCIS_DISPLAY_XGA, 1, "XGA"}, {PCIC_DISPLAY, PCIS_DISPLAY_3D, 1, "3D"}, {PCIC_MULTIMEDIA, -1, 1, "multimedia"}, {PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_VIDEO, 1, "video"}, {PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_AUDIO, 1, "audio"}, {PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_TELE, 1, "telephony"}, {PCIC_MULTIMEDIA, PCIS_MULTIMEDIA_HDA, 1, "HDA"}, {PCIC_MEMORY, -1, 1, "memory"}, {PCIC_MEMORY, PCIS_MEMORY_RAM, 1, "RAM"}, {PCIC_MEMORY, PCIS_MEMORY_FLASH, 1, "flash"}, {PCIC_BRIDGE, -1, 1, "bridge"}, {PCIC_BRIDGE, PCIS_BRIDGE_HOST, 1, "HOST-PCI"}, {PCIC_BRIDGE, PCIS_BRIDGE_ISA, 1, "PCI-ISA"}, {PCIC_BRIDGE, PCIS_BRIDGE_EISA, 1, "PCI-EISA"}, {PCIC_BRIDGE, PCIS_BRIDGE_MCA, 1, "PCI-MCA"}, {PCIC_BRIDGE, PCIS_BRIDGE_PCI, 1, "PCI-PCI"}, {PCIC_BRIDGE, PCIS_BRIDGE_PCMCIA, 1, "PCI-PCMCIA"}, {PCIC_BRIDGE, PCIS_BRIDGE_NUBUS, 1, "PCI-NuBus"}, {PCIC_BRIDGE, 
PCIS_BRIDGE_CARDBUS, 1, "PCI-CardBus"}, {PCIC_BRIDGE, PCIS_BRIDGE_RACEWAY, 1, "PCI-RACEway"}, {PCIC_SIMPLECOMM, -1, 1, "simple comms"}, {PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_UART, 1, "UART"}, /* could detect 16550 */ {PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_PAR, 1, "parallel port"}, {PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MULSER, 1, "multiport serial"}, {PCIC_SIMPLECOMM, PCIS_SIMPLECOMM_MODEM, 1, "generic modem"}, {PCIC_BASEPERIPH, -1, 0, "base peripheral"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_PIC, 1, "interrupt controller"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_DMA, 1, "DMA controller"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_TIMER, 1, "timer"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_RTC, 1, "realtime clock"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_PCIHOT, 1, "PCI hot-plug controller"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_SDHC, 1, "SD host controller"}, {PCIC_BASEPERIPH, PCIS_BASEPERIPH_IOMMU, 1, "IOMMU"}, {PCIC_INPUTDEV, -1, 1, "input device"}, {PCIC_INPUTDEV, PCIS_INPUTDEV_KEYBOARD, 1, "keyboard"}, {PCIC_INPUTDEV, PCIS_INPUTDEV_DIGITIZER,1, "digitizer"}, {PCIC_INPUTDEV, PCIS_INPUTDEV_MOUSE, 1, "mouse"}, {PCIC_INPUTDEV, PCIS_INPUTDEV_SCANNER, 1, "scanner"}, {PCIC_INPUTDEV, PCIS_INPUTDEV_GAMEPORT, 1, "gameport"}, {PCIC_DOCKING, -1, 1, "docking station"}, {PCIC_PROCESSOR, -1, 1, "processor"}, {PCIC_SERIALBUS, -1, 1, "serial bus"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_FW, 1, "FireWire"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_ACCESS, 1, "AccessBus"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_SSA, 1, "SSA"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_USB, 1, "USB"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_FC, 1, "Fibre Channel"}, {PCIC_SERIALBUS, PCIS_SERIALBUS_SMBUS, 0, "SMBus"}, {PCIC_WIRELESS, -1, 1, "wireless controller"}, {PCIC_WIRELESS, PCIS_WIRELESS_IRDA, 1, "iRDA"}, {PCIC_WIRELESS, PCIS_WIRELESS_IR, 1, "IR"}, {PCIC_WIRELESS, PCIS_WIRELESS_RF, 1, "RF"}, {PCIC_INTELLIIO, -1, 1, "intelligent I/O controller"}, {PCIC_INTELLIIO, PCIS_INTELLIIO_I2O, 1, "I2O"}, {PCIC_SATCOM, -1, 1, "satellite communication"}, {PCIC_SATCOM, PCIS_SATCOM_TV, 1, "sat TV"}, {PCIC_SATCOM, PCIS_SATCOM_AUDIO, 1, "sat audio"}, {PCIC_SATCOM, PCIS_SATCOM_VOICE, 1, "sat voice"}, {PCIC_SATCOM, PCIS_SATCOM_DATA, 1, "sat data"}, {PCIC_CRYPTO, -1, 1, "encrypt/decrypt"}, {PCIC_CRYPTO, PCIS_CRYPTO_NETCOMP, 1, "network/computer crypto"}, {PCIC_CRYPTO, PCIS_CRYPTO_ENTERTAIN, 1, "entertainment crypto"}, {PCIC_DASP, -1, 0, "dasp"}, {PCIC_DASP, PCIS_DASP_DPIO, 1, "DPIO module"}, {PCIC_DASP, PCIS_DASP_PERFCNTRS, 1, "performance counters"}, {PCIC_DASP, PCIS_DASP_COMM_SYNC, 1, "communication synchronizer"}, {PCIC_DASP, PCIS_DASP_MGMT_CARD, 1, "signal processing management"}, {PCIC_INSTRUMENT, -1, 0, "non-essential instrumentation"}, {0, 0, 0, NULL} }; void pci_probe_nomatch(device_t dev, device_t child) { int i, report; const char *cp, *scp; char *device; /* * Look for a listing for this device in a loaded device database. */ report = 1; if ((device = pci_describe_device(child)) != NULL) { device_printf(dev, "<%s>", device); free(device, M_DEVBUF); } else { /* * Scan the class/subclass descriptions for a general * description. */ cp = "unknown"; scp = NULL; for (i = 0; pci_nomatch_tab[i].desc != NULL; i++) { if (pci_nomatch_tab[i].class == pci_get_class(child)) { if (pci_nomatch_tab[i].subclass == -1) { cp = pci_nomatch_tab[i].desc; report = pci_nomatch_tab[i].report; } else if (pci_nomatch_tab[i].subclass == pci_get_subclass(child)) { scp = pci_nomatch_tab[i].desc; report = pci_nomatch_tab[i].report; } } } if (report || bootverbose) { device_printf(dev, "<%s%s%s>", cp ? 
cp : "", ((cp != NULL) && (scp != NULL)) ? ", " : "", scp ? scp : ""); } } if (report || bootverbose) { printf(" at device %d.%d (no driver attached)\n", pci_get_slot(child), pci_get_function(child)); } pci_cfg_save(child, device_get_ivars(child), 1); } void pci_child_detached(device_t dev, device_t child) { struct pci_devinfo *dinfo; struct resource_list *rl; dinfo = device_get_ivars(child); rl = &dinfo->resources; /* * Have to deallocate IRQs before releasing any MSI messages and * have to release MSI messages before deallocating any memory * BARs. */ if (resource_list_release_active(rl, dev, child, SYS_RES_IRQ) != 0) pci_printf(&dinfo->cfg, "Device leaked IRQ resources\n"); if (dinfo->cfg.msi.msi_alloc != 0 || dinfo->cfg.msix.msix_alloc != 0) { if (dinfo->cfg.msi.msi_alloc != 0) pci_printf(&dinfo->cfg, "Device leaked %d MSI " "vectors\n", dinfo->cfg.msi.msi_alloc); else pci_printf(&dinfo->cfg, "Device leaked %d MSI-X " "vectors\n", dinfo->cfg.msix.msix_alloc); (void)pci_release_msi(child); } if (resource_list_release_active(rl, dev, child, SYS_RES_MEMORY) != 0) pci_printf(&dinfo->cfg, "Device leaked memory resources\n"); if (resource_list_release_active(rl, dev, child, SYS_RES_IOPORT) != 0) pci_printf(&dinfo->cfg, "Device leaked I/O resources\n"); #ifdef PCI_RES_BUS if (resource_list_release_active(rl, dev, child, PCI_RES_BUS) != 0) pci_printf(&dinfo->cfg, "Device leaked PCI bus numbers\n"); #endif pci_cfg_save(child, dinfo, 1); } /* * Parse the PCI device database, if loaded, and return a pointer to a * description of the device. * * The database is flat text formatted as follows: * * Any line not in a valid format is ignored. * Lines are terminated with newline '\n' characters. * * A VENDOR line consists of the 4 digit (hex) vendor code, a TAB, then * the vendor name. * * A DEVICE line is entered immediately below the corresponding VENDOR ID. * - devices cannot be listed without a corresponding VENDOR line. * A DEVICE line consists of a TAB, the 4 digit (hex) device code, * another TAB, then the device name. */ /* * Assuming (ptr) points to the beginning of a line in the database, * return the vendor or device and description of the next entry. * The value of (vendor) or (device) inappropriate for the entry type * is set to -1. Returns nonzero at the end of the database. * * Note that this is slightly unrobust in the face of corrupt data; * we attempt to safeguard against this by spamming the end of the * database with a newline when we initialise. */ static int pci_describe_parse_line(char **ptr, int *vendor, int *device, char **desc) { char *cp = *ptr; int left; *device = -1; *vendor = -1; **desc = '\0'; for (;;) { left = pci_vendordata_size - (cp - pci_vendordata); if (left <= 0) { *ptr = cp; return(1); } /* vendor entry? */ if (*cp != '\t' && sscanf(cp, "%x\t%80[^\n]", vendor, *desc) == 2) break; /* device entry? */ if (*cp == '\t' && sscanf(cp, "%x\t%80[^\n]", device, *desc) == 2) break; /* skip to next line */ while (*cp != '\n' && left > 0) { cp++; left--; } if (*cp == '\n') { cp++; left--; } } /* skip to next line */ while (*cp != '\n' && left > 0) { cp++; left--; } if (*cp == '\n' && left > 0) cp++; *ptr = cp; return(0); } static char * pci_describe_device(device_t dev) { int vendor, device; char *desc, *vp, *dp, *line; desc = vp = dp = NULL; /* * If we have no vendor data, we can't do anything. 
*/ if (pci_vendordata == NULL) goto out; /* * Scan the vendor data looking for this device */ line = pci_vendordata; if ((vp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL) goto out; for (;;) { if (pci_describe_parse_line(&line, &vendor, &device, &vp)) goto out; if (vendor == pci_get_vendor(dev)) break; } if ((dp = malloc(80, M_DEVBUF, M_NOWAIT)) == NULL) goto out; for (;;) { if (pci_describe_parse_line(&line, &vendor, &device, &dp)) { *dp = 0; break; } if (vendor != -1) { *dp = 0; break; } if (device == pci_get_device(dev)) break; } if (dp[0] == '\0') snprintf(dp, 80, "0x%x", pci_get_device(dev)); if ((desc = malloc(strlen(vp) + strlen(dp) + 3, M_DEVBUF, M_NOWAIT)) != NULL) sprintf(desc, "%s, %s", vp, dp); out: if (vp != NULL) free(vp, M_DEVBUF); if (dp != NULL) free(dp, M_DEVBUF); return(desc); } int pci_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) { struct pci_devinfo *dinfo; pcicfgregs *cfg; dinfo = device_get_ivars(child); cfg = &dinfo->cfg; switch (which) { case PCI_IVAR_ETHADDR: /* * The generic accessor doesn't deal with failure, so * we set the return value, then return an error. */ *((uint8_t **) result) = NULL; return (EINVAL); case PCI_IVAR_SUBVENDOR: *result = cfg->subvendor; break; case PCI_IVAR_SUBDEVICE: *result = cfg->subdevice; break; case PCI_IVAR_VENDOR: *result = cfg->vendor; break; case PCI_IVAR_DEVICE: *result = cfg->device; break; case PCI_IVAR_DEVID: *result = (cfg->device << 16) | cfg->vendor; break; case PCI_IVAR_CLASS: *result = cfg->baseclass; break; case PCI_IVAR_SUBCLASS: *result = cfg->subclass; break; case PCI_IVAR_PROGIF: *result = cfg->progif; break; case PCI_IVAR_REVID: *result = cfg->revid; break; case PCI_IVAR_INTPIN: *result = cfg->intpin; break; case PCI_IVAR_IRQ: *result = cfg->intline; break; case PCI_IVAR_DOMAIN: *result = cfg->domain; break; case PCI_IVAR_BUS: *result = cfg->bus; break; case PCI_IVAR_SLOT: *result = cfg->slot; break; case PCI_IVAR_FUNCTION: *result = cfg->func; break; case PCI_IVAR_CMDREG: *result = cfg->cmdreg; break; case PCI_IVAR_CACHELNSZ: *result = cfg->cachelnsz; break; case PCI_IVAR_MINGNT: if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) { *result = -1; return (EINVAL); } *result = cfg->mingnt; break; case PCI_IVAR_MAXLAT: if (cfg->hdrtype != PCIM_HDRTYPE_NORMAL) { *result = -1; return (EINVAL); } *result = cfg->maxlat; break; case PCI_IVAR_LATTIMER: *result = cfg->lattimer; break; default: return (ENOENT); } return (0); } int pci_write_ivar(device_t dev, device_t child, int which, uintptr_t value) { struct pci_devinfo *dinfo; dinfo = device_get_ivars(child); switch (which) { case PCI_IVAR_INTPIN: dinfo->cfg.intpin = value; return (0); case PCI_IVAR_ETHADDR: case PCI_IVAR_SUBVENDOR: case PCI_IVAR_SUBDEVICE: case PCI_IVAR_VENDOR: case PCI_IVAR_DEVICE: case PCI_IVAR_DEVID: case PCI_IVAR_CLASS: case PCI_IVAR_SUBCLASS: case PCI_IVAR_PROGIF: case PCI_IVAR_REVID: case PCI_IVAR_IRQ: case PCI_IVAR_DOMAIN: case PCI_IVAR_BUS: case PCI_IVAR_SLOT: case PCI_IVAR_FUNCTION: return (EINVAL); /* disallow for now */ default: return (ENOENT); } } #include "opt_ddb.h" #ifdef DDB #include #include /* * List resources based on pci map registers, used for within ddb */ -DB_SHOW_COMMAND(pciregs, db_pci_dump) +DB_SHOW_COMMAND_FLAGS(pciregs, db_pci_dump, DB_CMD_MEMSAFE) { struct pci_devinfo *dinfo; struct devlist *devlist_head; struct pci_conf *p; const char *name; int i, error, none_count; none_count = 0; /* get the head of the device queue */ devlist_head = &pci_devq; /* * Go through the list of devices and print out devices */ for 
(error = 0, i = 0, dinfo = STAILQ_FIRST(devlist_head); (dinfo != NULL) && (error == 0) && (i < pci_numdevs) && !db_pager_quit; dinfo = STAILQ_NEXT(dinfo, pci_links), i++) { /* Populate pd_name and pd_unit */ name = NULL; if (dinfo->cfg.dev) name = device_get_name(dinfo->cfg.dev); p = &dinfo->conf; db_printf("%s%d@pci%d:%d:%d:%d:\tclass=0x%06x card=0x%08x " "chip=0x%08x rev=0x%02x hdr=0x%02x\n", (name && *name) ? name : "none", (name && *name) ? (int)device_get_unit(dinfo->cfg.dev) : none_count++, p->pc_sel.pc_domain, p->pc_sel.pc_bus, p->pc_sel.pc_dev, p->pc_sel.pc_func, (p->pc_class << 16) | (p->pc_subclass << 8) | p->pc_progif, (p->pc_subdevice << 16) | p->pc_subvendor, (p->pc_device << 16) | p->pc_vendor, p->pc_revid, p->pc_hdr); } } #endif /* DDB */ struct resource * pci_reserve_map(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int num, u_int flags) { struct pci_devinfo *dinfo = device_get_ivars(child); struct resource_list *rl = &dinfo->resources; struct resource *res; struct pci_map *pm; uint16_t cmd; pci_addr_t map, testval; int mapsize; res = NULL; /* If rid is managed by EA, ignore it */ if (pci_ea_is_enabled(child, *rid)) goto out; pm = pci_find_bar(child, *rid); if (pm != NULL) { /* This is a BAR that we failed to allocate earlier. */ mapsize = pm->pm_size; map = pm->pm_value; } else { /* * Weed out the bogons, and figure out how large the * BAR/map is. BARs that read back 0 here are bogus * and unimplemented. Note: atapci in legacy mode are * special and handled elsewhere in the code. If you * have a atapci device in legacy mode and it fails * here, that other code is broken. */ pci_read_bar(child, *rid, &map, &testval, NULL); /* * Determine the size of the BAR and ignore BARs with a size * of 0. Device ROM BARs use a different mask value. */ if (PCIR_IS_BIOS(&dinfo->cfg, *rid)) mapsize = pci_romsize(testval); else mapsize = pci_mapsize(testval); if (mapsize == 0) goto out; pm = pci_add_bar(child, *rid, map, mapsize); } if (PCI_BAR_MEM(map) || PCIR_IS_BIOS(&dinfo->cfg, *rid)) { if (type != SYS_RES_MEMORY) { if (bootverbose) device_printf(dev, "child %s requested type %d for rid %#x," " but the BAR says it is an memio\n", device_get_nameunit(child), type, *rid); goto out; } } else { if (type != SYS_RES_IOPORT) { if (bootverbose) device_printf(dev, "child %s requested type %d for rid %#x," " but the BAR says it is an ioport\n", device_get_nameunit(child), type, *rid); goto out; } } /* * For real BARs, we need to override the size that * the driver requests, because that's what the BAR * actually uses and we would otherwise have a * situation where we might allocate the excess to * another driver, which won't work. */ count = ((pci_addr_t)1 << mapsize) * num; if (RF_ALIGNMENT(flags) < mapsize) flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize); if (PCI_BAR_MEM(map) && (map & PCIM_BAR_MEM_PREFETCH)) flags |= RF_PREFETCHABLE; /* * Allocate enough resource, and then write back the * appropriate BAR for that resource. 
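 */

/*
 * Illustrative sketch, not part of this file: the mapsize used above
 * comes from the standard BAR sizing probe -- write all ones, read the
 * value back, mask off the type bits, and the two's complement of what
 * remains is the size, because the device hard-wires the low address
 * bits to zero.  Hypothetical helper for a 32-bit memory BAR:
 */
static uint32_t
bar_size_from_testval(uint32_t testval)
{

        testval &= ~(uint32_t)0xf;      /* low bits encode type/prefetch */
        if (testval == 0)
                return (0);             /* BAR not implemented */
        return (~testval + 1);          /* lowest writable bit == size */
}

/*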
*/ resource_list_add(rl, type, *rid, start, end, count); res = resource_list_reserve(rl, dev, child, type, rid, start, end, count, flags & ~RF_ACTIVE); if (res == NULL) { resource_list_delete(rl, type, *rid); device_printf(child, "%#jx bytes of rid %#x res %d failed (%#jx, %#jx).\n", count, *rid, type, start, end); goto out; } if (bootverbose) device_printf(child, "Lazy allocation of %#jx bytes rid %#x type %d at %#jx\n", count, *rid, type, rman_get_start(res)); /* Disable decoding via the CMD register before updating the BAR */ cmd = pci_read_config(child, PCIR_COMMAND, 2); pci_write_config(child, PCIR_COMMAND, cmd & ~(PCI_BAR_MEM(map) ? PCIM_CMD_MEMEN : PCIM_CMD_PORTEN), 2); map = rman_get_start(res); pci_write_bar(child, pm, map); /* Restore the original value of the CMD register */ pci_write_config(child, PCIR_COMMAND, cmd, 2); out: return (res); } struct resource * pci_alloc_multi_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_long num, u_int flags) { struct pci_devinfo *dinfo; struct resource_list *rl; struct resource_list_entry *rle; struct resource *res; pcicfgregs *cfg; /* * Perform lazy resource allocation */ dinfo = device_get_ivars(child); rl = &dinfo->resources; cfg = &dinfo->cfg; switch (type) { #if defined(NEW_PCIB) && defined(PCI_RES_BUS) case PCI_RES_BUS: return (pci_alloc_secbus(dev, child, rid, start, end, count, flags)); #endif case SYS_RES_IRQ: /* * Can't alloc legacy interrupt once MSI messages have * been allocated. */ if (*rid == 0 && (cfg->msi.msi_alloc > 0 || cfg->msix.msix_alloc > 0)) return (NULL); /* * If the child device doesn't have an interrupt * routed and is deserving of an interrupt, try to * assign it one. */ if (*rid == 0 && !PCI_INTERRUPT_VALID(cfg->intline) && (cfg->intpin != 0)) pci_assign_interrupt(dev, child, 0); break; case SYS_RES_IOPORT: case SYS_RES_MEMORY: #ifdef NEW_PCIB /* * PCI-PCI bridge I/O window resources are not BARs. * For those allocations just pass the request up the * tree. */ if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE) { switch (*rid) { case PCIR_IOBASEL_1: case PCIR_MEMBASE_1: case PCIR_PMBASEL_1: /* * XXX: Should we bother creating a resource * list entry? */ return (bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags)); } } #endif /* Reserve resources for this BAR if needed. */ rle = resource_list_find(rl, type, *rid); if (rle == NULL) { res = pci_reserve_map(dev, child, type, rid, start, end, count, num, flags); if (res == NULL) return (NULL); } } return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } struct resource * pci_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { #ifdef PCI_IOV struct pci_devinfo *dinfo; #endif if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); #ifdef PCI_IOV dinfo = device_get_ivars(child); if (dinfo->cfg.flags & PCICFG_VF) { switch (type) { /* VFs can't have I/O BARs. */ case SYS_RES_IOPORT: return (NULL); case SYS_RES_MEMORY: return (pci_vf_alloc_mem_resource(dev, child, rid, start, end, count, flags)); } /* Fall through for other types of resource allocations. 
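 */

/*
 * Illustrative sketch, not part of this file: from a device driver the
 * lazy BAR reservation above is reached through an ordinary
 * bus_alloc_resource_any() call whose rid is the BAR's config-space
 * offset; the matching release is bus_release_resource() with the same
 * rid.  The function name below is hypothetical.
 */
static struct resource *
mydev_map_bar0(device_t dev, int *ridp)
{

        *ridp = PCIR_BAR(0);            /* rid == config offset, 0x10 */
        return (bus_alloc_resource_any(dev, SYS_RES_MEMORY, ridp,
            RF_ACTIVE));
}

/*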
*/ } #endif return (pci_alloc_multi_resource(dev, child, type, rid, start, end, count, 1, flags)); } int pci_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct pci_devinfo *dinfo; struct resource_list *rl; pcicfgregs *cfg __unused; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); dinfo = device_get_ivars(child); cfg = &dinfo->cfg; #ifdef PCI_IOV if (cfg->flags & PCICFG_VF) { switch (type) { /* VFs can't have I/O BARs. */ case SYS_RES_IOPORT: return (EDOOFUS); case SYS_RES_MEMORY: return (pci_vf_release_mem_resource(dev, child, rid, r)); } /* Fall through for other types of resource allocations. */ } #endif #ifdef NEW_PCIB /* * PCI-PCI bridge I/O window resources are not BARs. For * those allocations just pass the request up the tree. */ if (cfg->hdrtype == PCIM_HDRTYPE_BRIDGE && (type == SYS_RES_IOPORT || type == SYS_RES_MEMORY)) { switch (rid) { case PCIR_IOBASEL_1: case PCIR_MEMBASE_1: case PCIR_PMBASEL_1: return (bus_generic_release_resource(dev, child, type, rid, r)); } } #endif rl = &dinfo->resources; return (resource_list_release(rl, dev, child, type, rid, r)); } int pci_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct pci_devinfo *dinfo; int error; error = bus_generic_activate_resource(dev, child, type, rid, r); if (error) return (error); /* Enable decoding in the command register when activating BARs. */ if (device_get_parent(child) == dev) { /* Device ROMs need their decoding explicitly enabled. */ dinfo = device_get_ivars(child); if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid)) pci_write_bar(child, pci_find_bar(child, rid), rman_get_start(r) | PCIM_BIOS_ENABLE); switch (type) { case SYS_RES_IOPORT: case SYS_RES_MEMORY: error = PCI_ENABLE_IO(dev, child, type); break; } } return (error); } int pci_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct pci_devinfo *dinfo; int error; error = bus_generic_deactivate_resource(dev, child, type, rid, r); if (error) return (error); /* Disable decoding for device ROMs. */ if (device_get_parent(child) == dev) { dinfo = device_get_ivars(child); if (type == SYS_RES_MEMORY && PCIR_IS_BIOS(&dinfo->cfg, rid)) pci_write_bar(child, pci_find_bar(child, rid), rman_get_start(r)); } return (0); } void pci_child_deleted(device_t dev, device_t child) { struct resource_list_entry *rle; struct resource_list *rl; struct pci_devinfo *dinfo; dinfo = device_get_ivars(child); rl = &dinfo->resources; EVENTHANDLER_INVOKE(pci_delete_device, child); /* Turn off access to resources we're about to free */ if (bus_child_present(child) != 0) { pci_write_config(child, PCIR_COMMAND, pci_read_config(child, PCIR_COMMAND, 2) & ~(PCIM_CMD_MEMEN | PCIM_CMD_PORTEN), 2); pci_disable_busmaster(child); } /* Free all allocated resources */ STAILQ_FOREACH(rle, rl, link) { if (rle->res) { if (rman_get_flags(rle->res) & RF_ACTIVE || resource_list_busy(rl, rle->type, rle->rid)) { pci_printf(&dinfo->cfg, "Resource still owned, oops. 
" "(type=%d, rid=%d, addr=%lx)\n", rle->type, rle->rid, rman_get_start(rle->res)); bus_release_resource(child, rle->type, rle->rid, rle->res); } resource_list_unreserve(rl, dev, child, rle->type, rle->rid); } } resource_list_free(rl); pci_freecfg(dinfo); } void pci_delete_resource(device_t dev, device_t child, int type, int rid) { struct pci_devinfo *dinfo; struct resource_list *rl; struct resource_list_entry *rle; if (device_get_parent(child) != dev) return; dinfo = device_get_ivars(child); rl = &dinfo->resources; rle = resource_list_find(rl, type, rid); if (rle == NULL) return; if (rle->res) { if (rman_get_flags(rle->res) & RF_ACTIVE || resource_list_busy(rl, type, rid)) { device_printf(dev, "delete_resource: " "Resource still owned by child, oops. " "(type=%d, rid=%d, addr=%jx)\n", type, rid, rman_get_start(rle->res)); return; } resource_list_unreserve(rl, dev, child, type, rid); } resource_list_delete(rl, type, rid); } struct resource_list * pci_get_resource_list (device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); return (&dinfo->resources); } #ifdef IOMMU bus_dma_tag_t pci_get_dma_tag(device_t bus, device_t dev) { bus_dma_tag_t tag; struct pci_softc *sc; if (device_get_parent(dev) == bus) { /* try iommu and return if it works */ tag = iommu_get_dma_tag(bus, dev); } else tag = NULL; if (tag == NULL) { sc = device_get_softc(bus); tag = sc->sc_dma_tag; } return (tag); } #else bus_dma_tag_t pci_get_dma_tag(device_t bus, device_t dev) { struct pci_softc *sc = device_get_softc(bus); return (sc->sc_dma_tag); } #endif uint32_t pci_read_config_method(device_t dev, device_t child, int reg, int width) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; #ifdef PCI_IOV /* * SR-IOV VFs don't implement the VID or DID registers, so we have to * emulate them here. */ if (cfg->flags & PCICFG_VF) { if (reg == PCIR_VENDOR) { switch (width) { case 4: return (cfg->device << 16 | cfg->vendor); case 2: return (cfg->vendor); case 1: return (cfg->vendor & 0xff); default: return (0xffffffff); } } else if (reg == PCIR_DEVICE) { switch (width) { /* Note that an unaligned 4-byte read is an error. 
*/ case 2: return (cfg->device); case 1: return (cfg->device & 0xff); default: return (0xffffffff); } } } #endif return (PCIB_READ_CONFIG(device_get_parent(dev), cfg->bus, cfg->slot, cfg->func, reg, width)); } void pci_write_config_method(device_t dev, device_t child, int reg, uint32_t val, int width) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; PCIB_WRITE_CONFIG(device_get_parent(dev), cfg->bus, cfg->slot, cfg->func, reg, val, width); } int pci_child_location_method(device_t dev, device_t child, struct sbuf *sb) { sbuf_printf(sb, "slot=%d function=%d dbsf=pci%d:%d:%d:%d", pci_get_slot(child), pci_get_function(child), pci_get_domain(child), pci_get_bus(child), pci_get_slot(child), pci_get_function(child)); return (0); } int pci_child_pnpinfo_method(device_t dev, device_t child, struct sbuf *sb) { struct pci_devinfo *dinfo; pcicfgregs *cfg; dinfo = device_get_ivars(child); cfg = &dinfo->cfg; sbuf_printf(sb, "vendor=0x%04x device=0x%04x subvendor=0x%04x " "subdevice=0x%04x class=0x%02x%02x%02x", cfg->vendor, cfg->device, cfg->subvendor, cfg->subdevice, cfg->baseclass, cfg->subclass, cfg->progif); return (0); } int pci_get_device_path_method(device_t bus, device_t child, const char *locator, struct sbuf *sb) { device_t parent = device_get_parent(bus); int rv; if (strcmp(locator, BUS_LOCATOR_UEFI) == 0) { rv = bus_generic_get_device_path(parent, bus, locator, sb); if (rv == 0) { sbuf_printf(sb, "/Pci(0x%x,0x%x)", pci_get_slot(child), pci_get_function(child)); } return (0); } return (bus_generic_get_device_path(bus, child, locator, sb)); } int pci_assign_interrupt_method(device_t dev, device_t child) { struct pci_devinfo *dinfo = device_get_ivars(child); pcicfgregs *cfg = &dinfo->cfg; return (PCIB_ROUTE_INTERRUPT(device_get_parent(dev), child, cfg->intpin)); } static void pci_lookup(void *arg, const char *name, device_t *dev) { long val; char *end; int domain, bus, slot, func; if (*dev != NULL) return; /* * Accept pciconf-style selectors of either pciD:B:S:F or * pciB:S:F. In the latter case, the domain is assumed to * be zero. 
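 */

/*
 * Illustrative sketch, not part of this file: the selectors accepted
 * here are the ones pciconf(8) prints, so "pci0:2:0:1" and "pci2:0:1"
 * name the same function on a single-domain machine.  The permissive
 * parser below only demonstrates the two spellings; the real code also
 * rejects trailing characters and out-of-range values.  The helper name
 * is hypothetical.
 */
static bool
parse_pci_selector(const char *name, int *dom, int *bus, int *slot, int *func)
{
        int v[4];

        switch (sscanf(name, "pci%d:%d:%d:%d", &v[0], &v[1], &v[2], &v[3])) {
        case 4:                         /* pci<domain>:<bus>:<slot>:<func> */
                *dom = v[0];
                *bus = v[1];
                *slot = v[2];
                *func = v[3];
                return (true);
        case 3:                         /* pci<bus>:<slot>:<func>, domain 0 */
                *dom = 0;
                *bus = v[0];
                *slot = v[1];
                *func = v[2];
                return (true);
        default:
                return (false);
        }
}

/*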
*/ if (strncmp(name, "pci", 3) != 0) return; val = strtol(name + 3, &end, 10); if (val < 0 || val > INT_MAX || *end != ':') return; domain = val; val = strtol(end + 1, &end, 10); if (val < 0 || val > INT_MAX || *end != ':') return; bus = val; val = strtol(end + 1, &end, 10); if (val < 0 || val > INT_MAX) return; slot = val; if (*end == ':') { val = strtol(end + 1, &end, 10); if (val < 0 || val > INT_MAX || *end != '\0') return; func = val; } else if (*end == '\0') { func = slot; slot = bus; bus = domain; domain = 0; } else return; if (domain > PCI_DOMAINMAX || bus > PCI_BUSMAX || slot > PCI_SLOTMAX || func > PCIE_ARI_FUNCMAX || (slot != 0 && func > PCI_FUNCMAX)) return; *dev = pci_find_dbsf(domain, bus, slot, func); } static int pci_modevent(module_t mod, int what, void *arg) { static struct cdev *pci_cdev; static eventhandler_tag tag; switch (what) { case MOD_LOAD: STAILQ_INIT(&pci_devq); pci_generation = 0; pci_cdev = make_dev(&pcicdev, 0, UID_ROOT, GID_WHEEL, 0644, "pci"); pci_load_vendor_data(); tag = EVENTHANDLER_REGISTER(dev_lookup, pci_lookup, NULL, 1000); break; case MOD_UNLOAD: if (tag != NULL) EVENTHANDLER_DEREGISTER(dev_lookup, tag); destroy_dev(pci_cdev); break; } return (0); } static void pci_cfg_restore_pcie(device_t dev, struct pci_devinfo *dinfo) { #define WREG(n, v) pci_write_config(dev, pos + (n), (v), 2) struct pcicfg_pcie *cfg; int version, pos; cfg = &dinfo->cfg.pcie; pos = cfg->pcie_location; version = cfg->pcie_flags & PCIEM_FLAGS_VERSION; WREG(PCIER_DEVICE_CTL, cfg->pcie_device_ctl); if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || cfg->pcie_type == PCIEM_TYPE_ENDPOINT || cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT) WREG(PCIER_LINK_CTL, cfg->pcie_link_ctl); if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || (cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT && (cfg->pcie_flags & PCIEM_FLAGS_SLOT)))) WREG(PCIER_SLOT_CTL, cfg->pcie_slot_ctl); if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || cfg->pcie_type == PCIEM_TYPE_ROOT_EC) WREG(PCIER_ROOT_CTL, cfg->pcie_root_ctl); if (version > 1) { WREG(PCIER_DEVICE_CTL2, cfg->pcie_device_ctl2); WREG(PCIER_LINK_CTL2, cfg->pcie_link_ctl2); WREG(PCIER_SLOT_CTL2, cfg->pcie_slot_ctl2); } #undef WREG } static void pci_cfg_restore_pcix(device_t dev, struct pci_devinfo *dinfo) { pci_write_config(dev, dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND, dinfo->cfg.pcix.pcix_command, 2); } void pci_cfg_restore(device_t dev, struct pci_devinfo *dinfo) { /* * Restore the device to full power mode. We must do this * before we restore the registers because moving from D3 to * D0 will cause the chip's BARs and some other registers to * be reset to some unknown power on reset values. Cut down * the noise on boot by doing nothing if we are already in * state D0. 
*/ if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0) pci_set_powerstate(dev, PCI_POWERSTATE_D0); pci_write_config(dev, PCIR_INTLINE, dinfo->cfg.intline, 1); pci_write_config(dev, PCIR_INTPIN, dinfo->cfg.intpin, 1); pci_write_config(dev, PCIR_CACHELNSZ, dinfo->cfg.cachelnsz, 1); pci_write_config(dev, PCIR_LATTIMER, dinfo->cfg.lattimer, 1); pci_write_config(dev, PCIR_PROGIF, dinfo->cfg.progif, 1); pci_write_config(dev, PCIR_REVID, dinfo->cfg.revid, 1); switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_NORMAL: pci_write_config(dev, PCIR_MINGNT, dinfo->cfg.mingnt, 1); pci_write_config(dev, PCIR_MAXLAT, dinfo->cfg.maxlat, 1); break; case PCIM_HDRTYPE_BRIDGE: pci_write_config(dev, PCIR_SECLAT_1, dinfo->cfg.bridge.br_seclat, 1); pci_write_config(dev, PCIR_SUBBUS_1, dinfo->cfg.bridge.br_subbus, 1); pci_write_config(dev, PCIR_SECBUS_1, dinfo->cfg.bridge.br_secbus, 1); pci_write_config(dev, PCIR_PRIBUS_1, dinfo->cfg.bridge.br_pribus, 1); pci_write_config(dev, PCIR_BRIDGECTL_1, dinfo->cfg.bridge.br_control, 2); break; case PCIM_HDRTYPE_CARDBUS: pci_write_config(dev, PCIR_SECLAT_2, dinfo->cfg.bridge.br_seclat, 1); pci_write_config(dev, PCIR_SUBBUS_2, dinfo->cfg.bridge.br_subbus, 1); pci_write_config(dev, PCIR_SECBUS_2, dinfo->cfg.bridge.br_secbus, 1); pci_write_config(dev, PCIR_PRIBUS_2, dinfo->cfg.bridge.br_pribus, 1); pci_write_config(dev, PCIR_BRIDGECTL_2, dinfo->cfg.bridge.br_control, 2); break; } pci_restore_bars(dev); if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_BRIDGE) pci_write_config(dev, PCIR_COMMAND, dinfo->cfg.cmdreg, 2); /* * Restore extended capabilities for PCI-Express and PCI-X */ if (dinfo->cfg.pcie.pcie_location != 0) pci_cfg_restore_pcie(dev, dinfo); if (dinfo->cfg.pcix.pcix_location != 0) pci_cfg_restore_pcix(dev, dinfo); /* Restore MSI and MSI-X configurations if they are present. */ if (dinfo->cfg.msi.msi_location != 0) pci_resume_msi(dev); if (dinfo->cfg.msix.msix_location != 0) pci_resume_msix(dev); #ifdef PCI_IOV if (dinfo->cfg.iov != NULL) pci_iov_cfg_restore(dev, dinfo); #endif } static void pci_cfg_save_pcie(device_t dev, struct pci_devinfo *dinfo) { #define RREG(n) pci_read_config(dev, pos + (n), 2) struct pcicfg_pcie *cfg; int version, pos; cfg = &dinfo->cfg.pcie; pos = cfg->pcie_location; cfg->pcie_flags = RREG(PCIER_FLAGS); version = cfg->pcie_flags & PCIEM_FLAGS_VERSION; cfg->pcie_device_ctl = RREG(PCIER_DEVICE_CTL); if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || cfg->pcie_type == PCIEM_TYPE_ENDPOINT || cfg->pcie_type == PCIEM_TYPE_LEGACY_ENDPOINT) cfg->pcie_link_ctl = RREG(PCIER_LINK_CTL); if (version > 1 || (cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || (cfg->pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT && (cfg->pcie_flags & PCIEM_FLAGS_SLOT)))) cfg->pcie_slot_ctl = RREG(PCIER_SLOT_CTL); if (version > 1 || cfg->pcie_type == PCIEM_TYPE_ROOT_PORT || cfg->pcie_type == PCIEM_TYPE_ROOT_EC) cfg->pcie_root_ctl = RREG(PCIER_ROOT_CTL); if (version > 1) { cfg->pcie_device_ctl2 = RREG(PCIER_DEVICE_CTL2); cfg->pcie_link_ctl2 = RREG(PCIER_LINK_CTL2); cfg->pcie_slot_ctl2 = RREG(PCIER_SLOT_CTL2); } #undef RREG } static void pci_cfg_save_pcix(device_t dev, struct pci_devinfo *dinfo) { dinfo->cfg.pcix.pcix_command = pci_read_config(dev, dinfo->cfg.pcix.pcix_location + PCIXR_COMMAND, 2); } void pci_cfg_save(device_t dev, struct pci_devinfo *dinfo, int setstate) { uint32_t cls; int ps; /* * Some drivers apparently write to these registers w/o updating our * cached copy. No harm happens if we update the copy, so do so here * so we can restore them. 
The COMMAND register is modified by the * bus w/o updating the cache. This should represent the normally * writable portion of the 'defined' part of type 0/1/2 headers. */ dinfo->cfg.vendor = pci_read_config(dev, PCIR_VENDOR, 2); dinfo->cfg.device = pci_read_config(dev, PCIR_DEVICE, 2); dinfo->cfg.cmdreg = pci_read_config(dev, PCIR_COMMAND, 2); dinfo->cfg.intline = pci_read_config(dev, PCIR_INTLINE, 1); dinfo->cfg.intpin = pci_read_config(dev, PCIR_INTPIN, 1); dinfo->cfg.cachelnsz = pci_read_config(dev, PCIR_CACHELNSZ, 1); dinfo->cfg.lattimer = pci_read_config(dev, PCIR_LATTIMER, 1); dinfo->cfg.baseclass = pci_read_config(dev, PCIR_CLASS, 1); dinfo->cfg.subclass = pci_read_config(dev, PCIR_SUBCLASS, 1); dinfo->cfg.progif = pci_read_config(dev, PCIR_PROGIF, 1); dinfo->cfg.revid = pci_read_config(dev, PCIR_REVID, 1); switch (dinfo->cfg.hdrtype & PCIM_HDRTYPE) { case PCIM_HDRTYPE_NORMAL: dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_0, 2); dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_0, 2); dinfo->cfg.mingnt = pci_read_config(dev, PCIR_MINGNT, 1); dinfo->cfg.maxlat = pci_read_config(dev, PCIR_MAXLAT, 1); break; case PCIM_HDRTYPE_BRIDGE: dinfo->cfg.bridge.br_seclat = pci_read_config(dev, PCIR_SECLAT_1, 1); dinfo->cfg.bridge.br_subbus = pci_read_config(dev, PCIR_SUBBUS_1, 1); dinfo->cfg.bridge.br_secbus = pci_read_config(dev, PCIR_SECBUS_1, 1); dinfo->cfg.bridge.br_pribus = pci_read_config(dev, PCIR_PRIBUS_1, 1); dinfo->cfg.bridge.br_control = pci_read_config(dev, PCIR_BRIDGECTL_1, 2); break; case PCIM_HDRTYPE_CARDBUS: dinfo->cfg.bridge.br_seclat = pci_read_config(dev, PCIR_SECLAT_2, 1); dinfo->cfg.bridge.br_subbus = pci_read_config(dev, PCIR_SUBBUS_2, 1); dinfo->cfg.bridge.br_secbus = pci_read_config(dev, PCIR_SECBUS_2, 1); dinfo->cfg.bridge.br_pribus = pci_read_config(dev, PCIR_PRIBUS_2, 1); dinfo->cfg.bridge.br_control = pci_read_config(dev, PCIR_BRIDGECTL_2, 2); dinfo->cfg.subvendor = pci_read_config(dev, PCIR_SUBVEND_2, 2); dinfo->cfg.subdevice = pci_read_config(dev, PCIR_SUBDEV_2, 2); break; } if (dinfo->cfg.pcie.pcie_location != 0) pci_cfg_save_pcie(dev, dinfo); if (dinfo->cfg.pcix.pcix_location != 0) pci_cfg_save_pcix(dev, dinfo); #ifdef PCI_IOV if (dinfo->cfg.iov != NULL) pci_iov_cfg_save(dev, dinfo); #endif /* * don't set the state for display devices, base peripherals and * memory devices since bad things happen when they are powered down. * We should (a) have drivers that can easily detach and (b) use * generic drivers for these devices so that some device actually * attaches. We need to make sure that when we implement (a) we don't * power the device down on a reattach. */ cls = pci_get_class(dev); if (!setstate) return; switch (pci_do_power_nodriver) { case 0: /* NO powerdown at all */ return; case 1: /* Conservative about what to power down */ if (cls == PCIC_STORAGE) return; /*FALLTHROUGH*/ case 2: /* Aggressive about what to power down */ if (cls == PCIC_DISPLAY || cls == PCIC_MEMORY || cls == PCIC_BASEPERIPH) return; /*FALLTHROUGH*/ case 3: /* Power down everything */ break; } /* * PCI spec says we can only go into D3 state from D0 state. * Transition from D[12] into D0 before going to D3 state. */ ps = pci_get_powerstate(dev); if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3) pci_set_powerstate(dev, PCI_POWERSTATE_D0); if (pci_get_powerstate(dev) != PCI_POWERSTATE_D3) pci_set_powerstate(dev, PCI_POWERSTATE_D3); } /* Wrapper APIs suitable for device driver use. 
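 */

/*
 * Illustrative sketch, not part of this file: these wrappers are what a
 * driver normally calls around a suspend/resume cycle or a device
 * reset -- save the standard config state, perform the disruptive
 * operation, then restore.  The method names below are hypothetical.
 */
static int
mydev_suspend(device_t dev)
{

        pci_save_state(dev);
        /* Device-specific quiescing would happen here. */
        return (0);
}

static int
mydev_resume(device_t dev)
{

        /* Power must be back at D0 before config registers are rewritten. */
        if (pci_get_powerstate(dev) != PCI_POWERSTATE_D0)
                pci_set_powerstate(dev, PCI_POWERSTATE_D0);
        pci_restore_state(dev);
        return (0);
}

/*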
*/ void pci_save_state(device_t dev) { struct pci_devinfo *dinfo; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); } void pci_restore_state(device_t dev) { struct pci_devinfo *dinfo; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); } static int pci_get_id_method(device_t dev, device_t child, enum pci_id_type type, uintptr_t *id) { return (PCIB_GET_ID(device_get_parent(dev), child, type, id)); } /* Find the upstream port of a given PCI device in a root complex. */ device_t pci_find_pcie_root_port(device_t dev) { struct pci_devinfo *dinfo; devclass_t pci_class; device_t pcib, bus; pci_class = devclass_find("pci"); KASSERT(device_get_devclass(device_get_parent(dev)) == pci_class, ("%s: non-pci device %s", __func__, device_get_nameunit(dev))); /* * Walk the bridge hierarchy until we find a PCI-e root * port or a non-PCI device. */ for (;;) { bus = device_get_parent(dev); KASSERT(bus != NULL, ("%s: null parent of %s", __func__, device_get_nameunit(dev))); pcib = device_get_parent(bus); KASSERT(pcib != NULL, ("%s: null bridge of %s", __func__, device_get_nameunit(bus))); /* * pcib's parent must be a PCI bus for this to be a * PCI-PCI bridge. */ if (device_get_devclass(device_get_parent(pcib)) != pci_class) return (NULL); dinfo = device_get_ivars(pcib); if (dinfo->cfg.pcie.pcie_location != 0 && dinfo->cfg.pcie.pcie_type == PCIEM_TYPE_ROOT_PORT) return (pcib); dev = pcib; } } /* * Wait for pending transactions to complete on a PCI-express function. * * The maximum delay is specified in milliseconds in max_delay. Note * that this function may sleep. * * Returns true if the function is idle and false if the timeout is * exceeded. If dev is not a PCI-express function, this returns true. */ bool pcie_wait_for_pending_transactions(device_t dev, u_int max_delay) { struct pci_devinfo *dinfo = device_get_ivars(dev); uint16_t sta; int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (true); sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2); while (sta & PCIEM_STA_TRANSACTION_PND) { if (max_delay == 0) return (false); /* Poll once every 100 milliseconds up to the timeout. */ if (max_delay > 100) { pause_sbt("pcietp", 100 * SBT_1MS, 0, C_HARDCLOCK); max_delay -= 100; } else { pause_sbt("pcietp", max_delay * SBT_1MS, 0, C_HARDCLOCK); max_delay = 0; } sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2); } return (true); } /* * Determine the maximum Completion Timeout in microseconds. * * For non-PCI-express functions this returns 0. */ int pcie_get_max_completion_timeout(device_t dev) { struct pci_devinfo *dinfo = device_get_ivars(dev); int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (0); /* * Functions using the 1.x spec use the default timeout range of * 50 microseconds to 50 milliseconds. Functions that do not * support programmable timeouts also use this range. 
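 *
 * (Editor's note: the value returned here is in microseconds.  A
 * hypothetical caller that wants to sleep for the whole window could
 * convert it for pause_sbt(), e.g.
 *
 *	int timo = pcie_get_max_completion_timeout(dev);
 *	if (timo != 0)
 *		pause_sbt("ctimo", ustosbt(timo), 0, C_HARDCLOCK);
 *
 * where "ctimo" is just an illustrative wait-channel name.)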
*/ if ((dinfo->cfg.pcie.pcie_flags & PCIEM_FLAGS_VERSION) < 2 || (pci_read_config(dev, cap + PCIER_DEVICE_CAP2, 4) & PCIEM_CAP2_COMP_TIMO_RANGES) == 0) return (50 * 1000); switch (pci_read_config(dev, cap + PCIER_DEVICE_CTL2, 2) & PCIEM_CTL2_COMP_TIMO_VAL) { case PCIEM_CTL2_COMP_TIMO_100US: return (100); case PCIEM_CTL2_COMP_TIMO_10MS: return (10 * 1000); case PCIEM_CTL2_COMP_TIMO_55MS: return (55 * 1000); case PCIEM_CTL2_COMP_TIMO_210MS: return (210 * 1000); case PCIEM_CTL2_COMP_TIMO_900MS: return (900 * 1000); case PCIEM_CTL2_COMP_TIMO_3500MS: return (3500 * 1000); case PCIEM_CTL2_COMP_TIMO_13S: return (13 * 1000 * 1000); case PCIEM_CTL2_COMP_TIMO_64S: return (64 * 1000 * 1000); default: return (50 * 1000); } } void pcie_apei_error(device_t dev, int sev, uint8_t *aerp) { struct pci_devinfo *dinfo = device_get_ivars(dev); const char *s; int aer; uint32_t r, r1; uint16_t rs; if (sev == PCIEM_STA_CORRECTABLE_ERROR) s = "Correctable"; else if (sev == PCIEM_STA_NON_FATAL_ERROR) s = "Uncorrectable (Non-Fatal)"; else s = "Uncorrectable (Fatal)"; device_printf(dev, "%s PCIe error reported by APEI\n", s); if (aerp) { if (sev == PCIEM_STA_CORRECTABLE_ERROR) { r = le32dec(aerp + PCIR_AER_COR_STATUS); r1 = le32dec(aerp + PCIR_AER_COR_MASK); } else { r = le32dec(aerp + PCIR_AER_UC_STATUS); r1 = le32dec(aerp + PCIR_AER_UC_MASK); } device_printf(dev, "status 0x%08x mask 0x%08x", r, r1); if (sev != PCIEM_STA_CORRECTABLE_ERROR) { r = le32dec(aerp + PCIR_AER_UC_SEVERITY); rs = le16dec(aerp + PCIR_AER_CAP_CONTROL); printf(" severity 0x%08x first %d\n", r, rs & 0x1f); } else printf("\n"); } /* As kind of recovery just report and clear the error statuses. */ if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0) { r = pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4); if (r != 0) { pci_write_config(dev, aer + PCIR_AER_UC_STATUS, r, 4); device_printf(dev, "Clearing UC AER errors 0x%08x\n", r); } r = pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4); if (r != 0) { pci_write_config(dev, aer + PCIR_AER_COR_STATUS, r, 4); device_printf(dev, "Clearing COR AER errors 0x%08x\n", r); } } if (dinfo->cfg.pcie.pcie_location != 0) { rs = pci_read_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_STA, 2); if ((rs & (PCIEM_STA_CORRECTABLE_ERROR | PCIEM_STA_NON_FATAL_ERROR | PCIEM_STA_FATAL_ERROR | PCIEM_STA_UNSUPPORTED_REQ)) != 0) { pci_write_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_STA, rs, 2); device_printf(dev, "Clearing PCIe errors 0x%04x\n", rs); } } } /* * Perform a Function Level Reset (FLR) on a device. * * This function first waits for any pending transactions to complete * within the timeout specified by max_delay. If transactions are * still pending, the function will return false without attempting a * reset. * * If dev is not a PCI-express function or does not support FLR, this * function returns false. * * Note that no registers are saved or restored. The caller is * responsible for saving and restoring any registers including * PCI-standard registers via pci_save_state() and * pci_restore_state(). */ bool pcie_flr(device_t dev, u_int max_delay, bool force) { struct pci_devinfo *dinfo = device_get_ivars(dev); uint16_t cmd, ctl; int compl_delay; int cap; cap = dinfo->cfg.pcie.pcie_location; if (cap == 0) return (false); if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR)) return (false); /* * Disable busmastering to prevent generation of new * transactions while waiting for the device to go idle. 
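 * (Editor's aside: the caller-side pattern described in the function
 * comment above -- save, reset, restore -- might look like this
 * sketch, with error handling omitted:
 *
 *	pci_save_state(dev);
 *	if (!pcie_flr(dev, 1000, false))
 *		(void)pci_power_reset(dev);	(fallback power cycle via D3)
 *	pci_restore_state(dev);
 *
 * which mirrors what pci_reset_child() does further down.)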
If * the idle timeout fails, the command register is restored * which will re-enable busmastering. */ cmd = pci_read_config(dev, PCIR_COMMAND, 2); pci_write_config(dev, PCIR_COMMAND, cmd & ~(PCIM_CMD_BUSMASTEREN), 2); if (!pcie_wait_for_pending_transactions(dev, max_delay)) { if (!force) { pci_write_config(dev, PCIR_COMMAND, cmd, 2); return (false); } pci_printf(&dinfo->cfg, "Resetting with transactions pending after %d ms\n", max_delay); /* * Extend the post-FLR delay to cover the maximum * Completion Timeout delay of anything in flight * during the FLR delay. Enforce a minimum delay of * at least 10ms. */ compl_delay = pcie_get_max_completion_timeout(dev) / 1000; if (compl_delay < 10) compl_delay = 10; } else compl_delay = 0; /* Initiate the reset. */ ctl = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2); pci_write_config(dev, cap + PCIER_DEVICE_CTL, ctl | PCIEM_CTL_INITIATE_FLR, 2); /* Wait for 100ms. */ pause_sbt("pcieflr", (100 + compl_delay) * SBT_1MS, 0, C_HARDCLOCK); if (pci_read_config(dev, cap + PCIER_DEVICE_STA, 2) & PCIEM_STA_TRANSACTION_PND) pci_printf(&dinfo->cfg, "Transactions pending after FLR!\n"); return (true); } /* * Attempt a power-management reset by cycling the device in/out of D3 * state. PCI spec says we can only go into D3 state from D0 state. * Transition from D[12] into D0 before going to D3 state. */ int pci_power_reset(device_t dev) { int ps; ps = pci_get_powerstate(dev); if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3) pci_set_powerstate(dev, PCI_POWERSTATE_D0); pci_set_powerstate(dev, PCI_POWERSTATE_D3); pci_set_powerstate(dev, ps); return (0); } /* * Try link drop and retrain of the downstream port of upstream * switch, for PCIe. According to the PCIe 3.0 spec 6.6.1, this must * cause Conventional Hot reset of the device in the slot. * Alternative, for PCIe, could be the secondary bus reset initiatied * on the upstream switch PCIR_BRIDGECTL_1, bit 6. */ int pcie_link_reset(device_t port, int pcie_location) { uint16_t v; v = pci_read_config(port, pcie_location + PCIER_LINK_CTL, 2); v |= PCIEM_LINK_CTL_LINK_DIS; pci_write_config(port, pcie_location + PCIER_LINK_CTL, v, 2); pause_sbt("pcier1", mstosbt(20), 0, 0); v &= ~PCIEM_LINK_CTL_LINK_DIS; v |= PCIEM_LINK_CTL_RETRAIN_LINK; pci_write_config(port, pcie_location + PCIER_LINK_CTL, v, 2); pause_sbt("pcier2", mstosbt(100), 0, 0); /* 100 ms */ v = pci_read_config(port, pcie_location + PCIER_LINK_STA, 2); return ((v & PCIEM_LINK_STA_TRAINING) != 0 ? ETIMEDOUT : 0); } static int pci_reset_post(device_t dev, device_t child) { if (dev == device_get_parent(child)) pci_restore_state(child); return (0); } static int pci_reset_prepare(device_t dev, device_t child) { if (dev == device_get_parent(child)) pci_save_state(child); return (0); } static int pci_reset_child(device_t dev, device_t child, int flags) { int error; if (dev == NULL || device_get_parent(child) != dev) return (0); if ((flags & DEVF_RESET_DETACH) != 0) { error = device_get_state(child) == DS_ATTACHED ? 
device_detach(child) : 0; } else { error = BUS_SUSPEND_CHILD(dev, child); } if (error == 0) { if (!pcie_flr(child, 1000, false)) { error = BUS_RESET_PREPARE(dev, child); if (error == 0) pci_power_reset(child); BUS_RESET_POST(dev, child); } if ((flags & DEVF_RESET_DETACH) != 0) device_probe_and_attach(child); else BUS_RESUME_CHILD(dev, child); } return (error); } const struct pci_device_table * pci_match_device(device_t child, const struct pci_device_table *id, size_t nelt) { bool match; uint16_t vendor, device, subvendor, subdevice, class, subclass, revid; vendor = pci_get_vendor(child); device = pci_get_device(child); subvendor = pci_get_subvendor(child); subdevice = pci_get_subdevice(child); class = pci_get_class(child); subclass = pci_get_subclass(child); revid = pci_get_revid(child); while (nelt-- > 0) { match = true; if (id->match_flag_vendor) match &= vendor == id->vendor; if (id->match_flag_device) match &= device == id->device; if (id->match_flag_subvendor) match &= subvendor == id->subvendor; if (id->match_flag_subdevice) match &= subdevice == id->subdevice; if (id->match_flag_class) match &= class == id->class_id; if (id->match_flag_subclass) match &= subclass == id->subclass; if (id->match_flag_revid) match &= revid == id->revid; if (match) return (id); id++; } return (NULL); } static void pci_print_faulted_dev_name(const struct pci_devinfo *dinfo) { const char *dev_name; device_t dev; dev = dinfo->cfg.dev; printf("pci%d:%d:%d:%d", dinfo->cfg.domain, dinfo->cfg.bus, dinfo->cfg.slot, dinfo->cfg.func); dev_name = device_get_name(dev); if (dev_name != NULL) printf(" (%s%d)", dev_name, device_get_unit(dev)); } void pci_print_faulted_dev(void) { struct pci_devinfo *dinfo; device_t dev; int aer, i; uint32_t r1, r2; uint16_t status; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { dev = dinfo->cfg.dev; status = pci_read_config(dev, PCIR_STATUS, 2); status &= PCIM_STATUS_MDPERR | PCIM_STATUS_STABORT | PCIM_STATUS_RTABORT | PCIM_STATUS_RMABORT | PCIM_STATUS_SERR | PCIM_STATUS_PERR; if (status != 0) { pci_print_faulted_dev_name(dinfo); printf(" error 0x%04x\n", status); } if (dinfo->cfg.pcie.pcie_location != 0) { status = pci_read_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_STA, 2); if ((status & (PCIEM_STA_CORRECTABLE_ERROR | PCIEM_STA_NON_FATAL_ERROR | PCIEM_STA_FATAL_ERROR | PCIEM_STA_UNSUPPORTED_REQ)) != 0) { pci_print_faulted_dev_name(dinfo); printf(" PCIe DEVCTL 0x%04x DEVSTA 0x%04x\n", pci_read_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_CTL, 2), status); } } if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0) { r1 = pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4); r2 = pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4); if (r1 != 0 || r2 != 0) { pci_print_faulted_dev_name(dinfo); printf(" AER UC 0x%08x Mask 0x%08x Svr 0x%08x\n" " COR 0x%08x Mask 0x%08x Ctl 0x%08x\n", r1, pci_read_config(dev, aer + PCIR_AER_UC_MASK, 4), pci_read_config(dev, aer + PCIR_AER_UC_SEVERITY, 4), r2, pci_read_config(dev, aer + PCIR_AER_COR_MASK, 4), pci_read_config(dev, aer + PCIR_AER_CAP_CONTROL, 4)); for (i = 0; i < 4; i++) { r1 = pci_read_config(dev, aer + PCIR_AER_HEADER_LOG + i * 4, 4); printf(" HL%d: 0x%08x\n", i, r1); } } } } } #ifdef DDB -DB_SHOW_COMMAND(pcierr, pci_print_faulted_dev_db) +DB_SHOW_COMMAND_FLAGS(pcierr, pci_print_faulted_dev_db, DB_CMD_MEMSAFE) { pci_print_faulted_dev(); } static void db_clear_pcie_errors(const struct pci_devinfo *dinfo) { device_t dev; int aer; uint32_t r; dev = dinfo->cfg.dev; r = pci_read_config(dev, dinfo->cfg.pcie.pcie_location + 
PCIER_DEVICE_STA, 2); pci_write_config(dev, dinfo->cfg.pcie.pcie_location + PCIER_DEVICE_STA, r, 2); if (pci_find_extcap(dev, PCIZ_AER, &aer) != 0) return; r = pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4); if (r != 0) pci_write_config(dev, aer + PCIR_AER_UC_STATUS, r, 4); r = pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4); if (r != 0) pci_write_config(dev, aer + PCIR_AER_COR_STATUS, r, 4); } -DB_COMMAND(pci_clearerr, db_pci_clearerr) +DB_COMMAND_FLAGS(pci_clearerr, db_pci_clearerr, DB_CMD_MEMSAFE) { struct pci_devinfo *dinfo; device_t dev; uint16_t status, status1; STAILQ_FOREACH(dinfo, &pci_devq, pci_links) { dev = dinfo->cfg.dev; status1 = status = pci_read_config(dev, PCIR_STATUS, 2); status1 &= PCIM_STATUS_MDPERR | PCIM_STATUS_STABORT | PCIM_STATUS_RTABORT | PCIM_STATUS_RMABORT | PCIM_STATUS_SERR | PCIM_STATUS_PERR; if (status1 != 0) { status &= ~status1; pci_write_config(dev, PCIR_STATUS, status, 2); } if (dinfo->cfg.pcie.pcie_location != 0) db_clear_pcie_errors(dinfo); } } #endif diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index c096ea220146..5bec6196ba27 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1,1875 +1,1875 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2018 The FreeBSD Foundation * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); register_t init386(int first); void dblfault_handler(void); void identify_cpu(void); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; static int above4g_allow = 1; static int above24g_allow = 0; int cold = 1; long Maxmem = 0; long realmem = 0; int late_console = 1; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; static void i386_clock_source_init(void); struct mtx icu_lock; struct mem_range_softc mem_range_softc; extern char start_exceptions[], end_exceptions[]; extern struct sysentvec elf32_freebsd_sysvec; /* Default init_ops implementation. */ struct init_ops init_ops = { .early_clock_source_init = i386_clock_source_init, .early_delay = i8254_delay, }; static void i386_clock_source_init(void) { i8254_init(); } static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. 
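 *
 * (Editor's example: smbios.memory.enabled is reported in kilobytes,
 * so the "<< 10" below converts it to bytes; a value of 8388608 KiB
 * becomes 8388608 << 10 = 8589934592 bytes, i.e. 8 GiB.)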
*/ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; struct mtx dt_lock; /* lock for GDT and LDT */ union descriptor gdt0[NGDT]; /* initial global descriptor table */ union descriptor *gdt = gdt0; /* global descriptor table */ union descriptor *ldt; /* local descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static struct i386tss *dblfault_tss; static char *dblfault_stack; static struct i386tss common_tss0; vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. 
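 *
 * (Editor's note: the "flat" entries below use ssd_base = 0 and
 * ssd_limit = 0xfffff with ssd_gran = 1, so the limit is counted in
 * 4 KiB units: (0xfffff + 1) * 4 KiB = 4 GiB.  Each such segment spans
 * the whole 32-bit address space and protection is left to the page
 * tables, as noted again in init386().)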
* * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = 0, .ssd_limit = sizeof(union descriptor) * NLDT - 1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = 0, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = 0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface 
(Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; size_t setidt_disp; void setidt(int idx, inthand_t *func, int typ, int dpl, int selec) { uintptr_t off; off = func != NULL ? (uintptr_t)func + setidt_disp : 0; setidt_nodisp(idx, off, typ, dpl, selec); } void setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = off; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((u_int)off) >> 16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ -DB_SHOW_COMMAND(idt, db_show_idt) +DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE) { struct gate_descriptor *ip; int idx; uintptr_t func, func_trm; bool trm; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { if (ip->gd_type == SDT_SYSTASKGT) { db_printf("%3d\t\n", idx); } else { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func >= PMAP_TRM_MIN_ADDRESS) { func_trm = func; func -= setidt_disp; trm = true; } else trm = false; if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); if (trm) db_printf(" (trampoline %#x)", func_trm); db_printf("\n"); } } ip++; } } /* Show privileged registers. 
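 *
 * (Editor's note on the mechanical change in this diff: DDB commands
 * that only inspect state are switched to the *_FLAGS macro variants
 * and marked DB_CMD_MEMSAFE.  The general shape, with placeholder
 * names, is
 *
 *	DB_SHOW_COMMAND_FLAGS(foo, db_show_foo, DB_CMD_MEMSAFE)
 *	{
 *		db_printf("...\n");
 *	}
 *
 * and the commands below follow exactly this pattern.)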
*/ -DB_SHOW_COMMAND(sysregs, db_show_sysregs) +DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016llx\n", rxcr(0)); if (amd_feature & (AMDID_NX | AMDID_LM)) db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t0x%016llx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); if (((cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) || cpu_vendor_id == CPU_VENDOR_HYGON) db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); if (cpu_feature & CPUID_PAT) db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); } -DB_SHOW_COMMAND(dbregs, db_show_dbregs) +DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE) { db_printf("dr0\t0x%08x\n", rdr0()); db_printf("dr1\t0x%08x\n", rdr1()); db_printf("dr2\t0x%08x\n", rdr2()); db_printf("dr3\t0x%08x\n", rdr3()); db_printf("dr6\t0x%08x\n", rdr6()); - db_printf("dr7\t0x%08x\n", rdr7()); + db_printf("dr7\t0x%08x\n", rdr7()); } DB_SHOW_COMMAND(frame, db_show_frame) { struct trapframe *frame; frame = have_addr ? (struct trapframe *)addr : curthread->td_frame; printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n", frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs, frame->tf_eip); printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno); printf("ds %#x es %#x fs %#x\n", frame->tf_ds, frame->tf_es, frame->tf_fs); printf("eax %#x ecx %#x edx %#x ebx %#x\n", frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx); printf("ebp %#x esi %#x edi %#x\n", frame->tf_ebp, frame->tf_esi, frame->tf_edi); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { uint64_t lim, ign; int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); lim = 0x100000000; /* 4G */ if (pae_mode && above4g_allow) lim = above24g_allow ? -1ULL : 0x600000000; /* 24G */ if (base >= lim) { printf("%uK of memory above %uGB ignored, pae %d " "above4g_allow %d above24g_allow %d\n", (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode, above4g_allow, above24g_allow); return (1); } if (base + length >= lim) { ign = base + length - lim; length -= ign; printf("%uK of memory above %uGB ignored, pae %d " "above4g_allow %d above24g_allow %d\n", (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode, above4g_allow, above24g_allow); } /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. 
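 *
 * (Editor's illustration of the layout assumed here: physmap[] holds
 * (base, end) pairs sorted by base, so with physmap_idx == 2 it might
 * contain, for example,
 *
 *	physmap[0] = 0x00000000   physmap[1] = 0x0009f000   (base memory)
 *	physmap[2] = 0x00100000   physmap[3] = 0xc0000000   (extended)
 *
 * and a new range is either merged into an adjacent entry or inserted
 * at insert_idx, shifting later pairs up by two slots.)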
*/ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); return (add_physmap_entry(smap->base, smap->length, physmap, physmap_idxp)); } static void add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, int *physmap_idxp) { struct bios_smap *smap, *smapend; u_int32_t smapsize; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } static void basemem_setup(void) { if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } pmap_basemem_setup(basemem); } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int has_smap, off, physmap_idx, pa_indx, da_indx; u_long memtest; vm_paddr_t physmap[PHYS_AVAIL_ENTRIES]; quad_t dcons_addr, dcons_size, physmem_tunable; int hasbrokenint12, i, res __diagused; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; has_smap = 0; bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_early_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first)); TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow); TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow); /* * Check if the loader supplied an SMAP memory map. If so, * use that and do not make any VM86 calls. 
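 *
 * (Editor's note: each SMAP record handed over by the loader is a
 * struct bios_smap, roughly
 *
 *	struct bios_smap {
 *		uint64_t base;
 *		uint64_t length;
 *		uint32_t type;	(SMAP_TYPE_MEMORY is usable RAM)
 *	} __packed;
 *
 * which is the record format add_smap_entries() below walks through.)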
*/ physmap_idx = 0; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { add_smap_entries(smapbase, physmap, &physmap_idx); has_smap = 1; goto have_smap; } /* * Some newer BIOSes have a broken INT 12H implementation * which causes a kernel panic immediately. In this case, we * need use the SMAP to determine the base memory size. */ hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); if (hasbrokenint12 == 0) { /* Use INT12 to determine base memory size. */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; basemem_setup(); } /* * Fetch the memory map with INT 15:E820. Map page 1 R/W into * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1)); res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); KASSERT(res != 0, ("vm86_getptr() failed: address not found")); vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; has_smap = 1; if (!add_smap_entry(smap, physmap, &physmap_idx)) break; } while (vmf.vmf_ebx != 0); have_smap: /* * If we didn't fetch the "base memory" size from INT12, * figure it out from the SMAP (or just guess). */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* XXX: If we couldn't find basemem from SMAP, just guess. */ if (basemem == 0) basemem = 640; basemem_setup(); } if (physmap[1] != 0) goto physmap_done; /* * If we failed to find an SMAP, figure out the extended * memory size. We will then build a simple memory map with * two segments, one for "base memory" and the second for * "extended memory". Note that "extended memory" starts at a * physical address of 1MB and that both basemem and extmem * are in units of 1KB. * * First, try to fetch the extended memory size via INT 15:E801. */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ alloc_ap_trampoline(physmap, &physmap_idx); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
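 * (Editor's note on units: Maxmem is a page count, so with 4 KiB pages
 * the "MAXMEM / 4" below converts a MAXMEM option given in kilobytes
 * into pages, and the verbose printf later reports "Maxmem * 4" in K
 * for the same reason; e.g. MAXMEM=1048576 (1 GiB) yields Maxmem =
 * 262144 pages.)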
We may adjust this * based on ``hw.physmem'' and the results of the memory test. * * This is especially confusing when it is much larger than the * memory size and is displayed as "realmem". */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * map page into kernel: valid, read/write,non-cacheable */ ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. 
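 * (Editor's illustration: phys_avail[] and dump_avail[] use the same
 * (start, end) pairing as physmap[], so after this loop phys_avail[]
 * might look like
 *
 *	phys_avail[0] = 0x00001000   phys_avail[1] = 0x0009f000
 *	phys_avail[2] = 0x00400000   phys_avail[3] = 0x3fe00000
 *
 * with page 0 masked off and the kernel's own pages skipped; the
 * values shown are illustrative only and machine-dependent.)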
* If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } pmap_cmap3(0, 0); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } static void i386_kdb_init(void) { #ifdef DDB db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab, 0); #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } static void fixup_idt(void) { struct gate_descriptor *ip; uintptr_t off; int x; for (x = 0; x < NIDT; x++) { ip = &idt[x]; if (ip->gd_type != SDT_SYS386IGT && ip->gd_type != SDT_SYS386TGT) continue; off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16); KASSERT(off >= (uintptr_t)start_exceptions && off < (uintptr_t)end_exceptions, ("IDT[%d] type %d off %#x", x, ip->gd_type, off)); off += setidt_disp; MPASS(off >= PMAP_TRM_MIN_ADDRESS && off < PMAP_TRM_MAX_ADDRESS); ip->gd_looffset = off; ip->gd_hioffset = off >> 16; } } static void i386_setidt1(void) { int x; /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), 
SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif } static void i386_setidt2(void) { setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); } #if defined(DEV_ISA) && !defined(DEV_ATPIC) static void i386_setidt3(void) { setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); } #endif register_t init386(int first) { struct region_descriptor r_gdt, r_idt; /* table descriptors */ int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; struct xstate_hdr *xhdr; caddr_t kmdp; vm_offset_t addend; size_t ucode_len; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = TD0_KSTACK_PAGES; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); if (bootinfo.bi_modulep) { metadata_missing = 0; addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? PMAP_MAP_LOW : 0; preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; preload_bootstrap_relocate(addend); } else { metadata_missing = 1; } if (bootinfo.bi_envp != 0) { addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? PMAP_MAP_LOW : 0; init_static_kenv((char *)bootinfo.bi_envp + addend, 0); } else { init_static_kenv(NULL, 0); } /* * Re-evaluate CPU features if we loaded a microcode update. */ ucode_len = ucode_load_bsp(first); if (ucode_len != 0) { identify_cpu(); first = roundup2(first + ucode_len, PAGE_SIZE); } identify_hypervisor(); /* Init basic tunables, hz etc */ init_param1(); /* Set bootmethod to BIOS: it's the only supported on i386. */ strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int)pc; gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt0[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1; r_gdt.rd_base = (int)gdt0; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa, pa); dpcpu_init((void *)first, 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. 
* * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); i386_setidt1(); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); finishidentcpu(); /* Final stage of CPU initialization */ /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); i386_setidt2(); pmap_set_nx(); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); /* Initialize the tss (except for the final esp0) early for vm86. */ common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - VM86_STACK_SPACE; common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ i386_setidt3(); #endif #endif /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); i386_kdb_init(); } kmdp = preload_search_by_type("elf kernel"); link_elf_ireloc(kmdp); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); if (late_console) i386_kdb_init(); msgbufinit(msgbufp, msgbufsize); npxinit(true); /* * Set up thread0 pcb after npxinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. 
*/ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; PCPU_SET(kesp0, common_tss0.tss_esp0); gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = pmap_get_kcr3(); thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } static void machdep_init_trampoline(void) { struct region_descriptor r_gdt, r_idt; struct i386tss *tss; char *copyout_buf, *trampoline, *tramp_stack_base; int x; gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, M_NOWAIT | M_ZERO); bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int)gdt; lgdt(&r_gdt); tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, M_NOWAIT | M_ZERO); bcopy(&common_tss0, tss, sizeof(struct i386tss)); gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tssp, tss); ltr(GSEL(GPROC0_SEL, SEL_KPL)); trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, M_NOWAIT); bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ - VM86_STACK_SPACE); tss[0].tss_esp0 = PCPU_GET(trampstk); idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); bcopy(idt0, idt, sizeof(idt0)); /* Re-initialize new IDT since the handlers were relocated */ setidt_disp = trampoline - start_exceptions; fixup_idt(); r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; r_idt.rd_base = (int)idt; lidt(&r_idt); /* dblfault TSS */ dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = (int)dblfault_stack + PAGE_SIZE; dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss->tss_cr3 = pmap_get_kcr3(); dblfault_tss->tss_eip = (int)dblfault_handler; dblfault_tss->tss_eflags = PSL_KERNEL; dblfault_tss->tss_ds = dblfault_tss->tss_es = dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; /* make ldt memory segments */ ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, M_NOWAIT | M_ZERO); gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < nitems(ldt_segs); x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); PCPU_SET(copyout_buf, copyout_buf); copyout_init_tramp(); } 
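/*
 * Editor's note: machdep_init_trampoline() is wired into the boot
 * sequence by the SYSINIT() just below.  For reference, the general
 * shape of such a registration (placeholder names) is:
 *
 *	static void
 *	foo_init(void *arg __unused)
 *	{
 *		...
 *	}
 *	SYSINIT(foo, SI_SUB_VM, SI_ORDER_SECOND, foo_init, NULL);
 *
 * where the (subsystem, order) pair decides when mi_startup() runs the
 * function relative to the other entries in the sysinit set.
 */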
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); #ifdef COMPAT_43 static void i386_setup_lcall_gate(void) { struct sysentvec *sv; struct user_segment_descriptor desc; u_int lcall_addr; sv = &elf32_freebsd_sysvec; lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; bzero(&desc, sizeof(desc)); desc.sd_type = SDT_MEMERA; desc.sd_dpl = SEL_UPL; desc.sd_p = 1; desc.sd_def32 = 1; desc.sd_gran = 1; desc.sd_lolimit = 0xffff; desc.sd_hilimit = 0xf; desc.sd_lobase = lcall_addr; desc.sd_hibase = lcall_addr >> 24; bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); } SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); #endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct region_descriptor r_idt; struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO); if (tmp == 0) panic("kmem_malloc returned 0"); tmp = round_page(tmp); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor *) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; r_idt.rd_limit = sizeof(idt0) - 1; lidt(&r_idt); /* SMP machines do not need the F00F hack. */ idt = new_idt; pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. 
The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; pcb->pcb_gs = rgs(); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index eb1323d12f61..1974c4e68ce4 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,909 +1,909 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; int linux_alloc_current_noop(struct thread *td __unused, int flags __unused) { return (0); } int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef VERBOSE_SYSINIT /* * We'll use the defined value of VERBOSE_SYSINIT from the kernel config to * dictate the default VERBOSE_SYSINIT behavior. Significant values for this * option and associated tunable are: * - 0, 'compiled in but silent by default' * - 1, 'compiled in but verbose by default' (default) */ int verbose_sysinit = VERBOSE_SYSINIT; TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit); #endif #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. 
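 *
 * The expected call pattern, sketched for illustration (the begin/end
 * pointers would come from a module's own sysinit_set linker set; the
 * variable names here are made up):
 *
 *	struct sysinit **begin, **end;
 *
 *	begin = ...;
 *	end = ...;
 *	sysinit_add(begin, end);
 *
 * mi_startup() later notices the non-NULL newsysinit list, switches over
 * to it and restarts its scan so the merged entries run at their proper
 * subsystem/order position.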
*/ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ int last; #if defined(VERBOSE_SYSINIT) int verbose; #endif TSENTER(); if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } last = SI_SUB_COPYRIGHT; #if defined(VERBOSE_SYSINIT) verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; if ((*sipp)->subsystem > last) BOOTTRACE_INIT("sysinit 0x%7x", (*sipp)->subsystem); #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last && verbose_sysinit != 0) { verbose = 1; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... 
", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ last = (*sipp)->subsystem; (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } TSEXIT(); /* Here so we don't overlap with start_init. */ BOOTTRACE("mi_startup done"); mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } static void null_set_fork_retval(struct thread *td __unused) { } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_set_fork_retval = null_set_fork_retval, .sv_regset_begin = NULL, .sv_regset_end = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; struct uidinfo tmpuinfo; struct loginclass tmplc = { .lc_name = "", }; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. 
*/ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; td->td_tid = THREAD0_TID; tidhash_add(td); TD_SET_STATE(td, TDS_RUNNING); td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; p->p_treeflag |= P_TREE_REAPER; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); TAILQ_INIT(&p->p_kqtim_stop); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ /* A hack to prevent uifind from tripping over NULL pointers. */ curthread->td_ucred = newcred; tmpuinfo.ui_uid = 1; newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_loginclass = &tmplc; newcred->cr_loginclass = loginclass_find("default"); /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; newcred->cr_users++; /* avoid assertion failure */ proc_set_cred_init(p, newcred); newcred->cr_users--; crfree(newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_pd = pdinit(NULL, false); p->p_fd = fdinit(); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. 
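 * (With PAE, vm_paddr_t is 64 bits while u_long and vm_offset_t are still
 * 32 bits, so applying ptoa() to the raw page count could wrap once free
 * memory exceeds 4GB; widening the count to vm_paddr_t first keeps the
 * page-to-byte shift in 64-bit arithmetic.)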
*/ pageablemem = ptoa((vm_paddr_t)vm_free_count()); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; refcount_init(&vmspace0.vm_refcnt, 1); pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); EVENTHANDLER_DIRECT_INVOKE(thread_init, td); #ifdef KDTRACE_HOOKS kdtrace_proc_ctor(p); kdtrace_thread_ctor(td); #endif EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; PROC_STATUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. 
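 *
 * The candidate list can be overridden from the loader; for instance, an
 * illustrative loader.conf entry of
 *
 *	init_path="/rescue/init"
 *
 * makes start_init() try only that binary.  Otherwise each colon-separated
 * entry is attempted in turn until one of the kern_execve() calls succeeds.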
*/ static void start_init(void *dummy) { struct image_args args; int error; char *var, *path; char *free_init_path, *tmp_init_path; struct thread *td; struct proc *p; struct vmspace *oldvmspace; TSENTER(); /* Here so we don't overlap with mi_startup. */ td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* For Multicons, report which console is primary to both */ if (boothowto & RB_MULTIPLE) { if (boothowto & RB_SERIAL) printf("Dual Console: Serial Primary, Video Secondary\n"); else printf("Dual Console: Video Primary, Serial Secondary\n"); } if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } free_init_path = tmp_init_path = strdup(init_path, M_TEMP); while ((path = strsep(&tmp_init_path, ":")) != NULL) { if (bootverbose) printf("start_init: trying %s\n", path); memset(&args, 0, sizeof(args)); error = exec_alloc_args(&args); if (error != 0) panic("%s: Can't allocate space for init arguments %d", __func__, error); error = exec_args_add_fname(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add fname %d", __func__, error); error = exec_args_add_arg(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); if (boothowto & RB_SINGLE) error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); oldvmspace = p->p_vmspace; error = kern_execve(td, &args, NULL, oldvmspace); KASSERT(error != 0, ("kern_execve returned success, not EJUSTRETURN")); if (error == EJUSTRETURN) { exec_cleanup(td, oldvmspace); free(free_init_path, M_TEMP); TSEXIT(); return; } if (error != ENOENT) printf("exec %s: error %d\n", path, error); } free(free_init_path, M_TEMP); printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. We do this * early to reserve pid 1. Note special case - do not make it * runnable yet, init execution is started when userspace can be served. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crcowfree(td); td->td_realucred = crcowget(initproc->p_ucred); td->td_ucred = td->td_realucred; PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. 
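 *
 * create_init() forked pid 1 with RFSTOPPED, so the process exists but has
 * never been put on a run queue.  kick_init() below supplies the second
 * half of the usual two-step pattern for kernel-created processes (rough
 * outline, not literal code):
 *
 *	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
 *	fork1(&thread0, &fr);			at SI_SUB_CREATE_INIT
 *	...
 *	TD_SET_CAN_RUN(td);
 *	sched_add(td, SRQ_BORING);		at SI_SUB_KTHREAD_INIT
 *
 * so init only becomes runnable once the scheduler and everything it needs
 * are initialized.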
*/ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); /* * DDB(4). */ #ifdef DDB static void db_show_print_syinit(struct sysinit *sip, bool ddb) { const char *sname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (sip == NULL) { xprint("%s: no sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset); db_symbol_values(sym, &sname, NULL); sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip); xprint(" %#08x %#08x\n", sip->subsystem, sip->order); xprint(" %p(%s)(%p)\n", sip->func, (funcname != NULL) ? funcname : "", sip->udata); #undef xprint } -DB_SHOW_COMMAND(sysinit, db_show_sysinit) +DB_SHOW_COMMAND_FLAGS(sysinit, db_show_sysinit, DB_CMD_MEMSAFE) { struct sysinit **sipp; db_printf("SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); for (sipp = sysinit; sipp < sysinit_end; sipp++) { db_show_print_syinit(*sipp, true); if (db_pager_quit) break; } } #endif /* DDB */ diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index 0670d9a046a7..ed0a666c460c 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -1,2520 +1,2520 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. * * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 
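 *
 * As a concrete illustration (error handling omitted), a thread binding
 * itself to CPU 0 and then reading back the cpus of its base set would do:
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	(void)cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 *
 *	(void)cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);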
*/ LIST_HEAD(domainlist, domainset); struct domainset __read_mostly domainset_firsttouch; struct domainset __read_mostly domainset_fixed[MAXMEMDOM]; struct domainset __read_mostly domainset_interleave; struct domainset __read_mostly domainset_prefer[MAXMEMDOM]; struct domainset __read_mostly domainset_roundrobin; static uma_zone_t cpuset_zone; static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel; static struct domainset *domainset0, *domainset2; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; static int domainset_valid(const struct domainset *, const struct domainset *); /* * Find the first non-anonymous set starting from 'set'. */ static struct cpuset * cpuset_getbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; return (set); } /* * Walks up the tree from 'set' to find the root. */ static struct cpuset * cpuset_getroot(struct cpuset *set) { while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) set = set->cs_parent; return (set); } /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { return (cpuset_ref(cpuset_getroot(set))); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { return (cpuset_ref(cpuset_getbase(set))); } /* * Release a reference in a context where it is safe to allocate. */ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. */ static void cpuset_rel_complete(struct cpuset *set) { cpusetid_t id; id = set->cs_id; LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Find a set based on an id. Returns it with a ref. 
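 * The reference must be dropped with cpuset_rel() when the caller is done;
 * lookups made from within a jail only succeed when the set is the jail's
 * own cpuset or a descendant of it.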
*/ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Initialize a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int cpuset_init(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, struct domainset *domain, cpusetid_t id) { if (domain == NULL) domain = parent->cs_domain; if (mask == NULL) mask = &parent->cs_mask; if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); /* The domain must be prepared ahead of time. */ if (!domainset_valid(parent->cs_domain, domain)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); set->cs_domain = domain; CPU_AND(&set->cs_mask, &set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. * * If *setp is not NULL, then it will be used as-is. The caller must take * into account that *setp will be inserted at the head of cpuset_ids and * plan any potentially conflicting cs_link usage accordingly. 
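 *
 * The common case passes a NULL *setp and lets the routine allocate from
 * cpuset_zone, e.g. (illustrative, names made up):
 *
 *	struct cpuset *nset = NULL;
 *
 *	error = cpuset_create(&nset, parent, &mask);
 *	if (error == 0)
 *		... use nset, release with cpuset_rel() ...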
*/ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; bool dofree; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); dofree = (*setp == NULL); if (*setp != NULL) set = *setp; else *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, parent, mask, NULL, id); if (error == 0) return (0); free_unr(cpuset_unr, id); if (dofree) uma_zfree(cpuset_zone, set); return (error); } static void cpuset_freelist_add(struct setlist *list, int count) { struct cpuset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, cs_link); } } static void cpuset_freelist_init(struct setlist *list, int count) { LIST_INIT(list); cpuset_freelist_add(list, count); } static void cpuset_freelist_free(struct setlist *list) { struct cpuset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, cs_link); uma_zfree(cpuset_zone, set); } } static void domainset_freelist_add(struct domainlist *list, int count) { struct domainset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, ds_link); } } static void domainset_freelist_init(struct domainlist *list, int count) { LIST_INIT(list); domainset_freelist_add(list, count); } static void domainset_freelist_free(struct domainlist *list) { struct domainset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, ds_link); uma_zfree(domainset_zone, set); } } /* Copy a domainset preserving mask and policy. */ static void domainset_copy(const struct domainset *from, struct domainset *to) { DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); to->ds_policy = from->ds_policy; to->ds_prefer = from->ds_prefer; } /* Return 1 if mask and policy are equal, otherwise 0. */ static int domainset_equal(const struct domainset *one, const struct domainset *two) { return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && one->ds_policy == two->ds_policy && one->ds_prefer == two->ds_prefer); } /* Return 1 if child is a valid subset of parent. */ static int domainset_valid(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } static int domainset_restrict(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } /* * Lookup or create a domainset. The key is provided in ds_mask and * ds_policy. If the domainset does not yet exist the storage in * 'domain' is used to insert. Otherwise this storage is freed to the * domainset_zone and the existing domainset is returned. 
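 * Either way the caller must continue with the returned pointer rather
 * than with 'domain', since the passed-in storage may already have been
 * given back to the zone (or parked on 'freelist') by the time this
 * returns.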
*/ static struct domainset * _domainset_create(struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; int i, j; KASSERT(domain->ds_cnt <= vm_ndomains, ("invalid domain count in domainset %p", domain)); KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER || domain->ds_prefer < vm_ndomains, ("invalid preferred domain in domains %p", domain)); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(ndomain, &cpuset_domains, ds_link) if (domainset_equal(ndomain, domain)) break; /* * If the domain does not yet exist we insert it and initialize * various iteration helpers which are not part of the key. */ if (ndomain == NULL) { LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; } mtx_unlock_spin(&cpuset_lock); if (ndomain == NULL) return (domain); if (freelist != NULL) LIST_INSERT_HEAD(freelist, domain, ds_link); else uma_zfree(domainset_zone, domain); return (ndomain); } /* * Are any of the domains in the mask empty? If so, silently * remove them and update the domainset accordingly. If only empty * domains are present, we must return failure. */ static bool domainset_empty_vm(struct domainset *domain) { domainset_t empty; int i, j; DOMAINSET_ZERO(&empty); for (i = 0; i < vm_ndomains; i++) if (VM_DOMAIN_EMPTY(i)) DOMAINSET_SET(i, &empty); if (DOMAINSET_SUBSET(&empty, &domain->ds_mask)) return (true); /* Remove empty domains from the set and recompute. */ DOMAINSET_ANDNOT(&domain->ds_mask, &empty); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; /* Convert a PREFER policy referencing an empty domain to RR. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER && DOMAINSET_ISSET(domain->ds_prefer, &empty)) { domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; domain->ds_prefer = -1; } return (false); } /* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * domainset_create(const struct domainset *domain) { struct domainset *ndomain; /* * Validate the policy. It must specify a useable policy number with * only valid domains. Preferred must include the preferred domain * in the mask. */ if (domain->ds_policy <= DOMAINSET_POLICY_INVALID || domain->ds_policy > DOMAINSET_POLICY_MAX) return (NULL); if (domain->ds_policy == DOMAINSET_POLICY_PREFER && !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask)) return (NULL); if (!DOMAINSET_SUBSET(&domainset0->ds_mask, &domain->ds_mask)) return (NULL); ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); domainset_copy(domain, ndomain); return _domainset_create(ndomain, NULL); } /* * Update thread domainset pointers. */ static void domainset_notify(void) { struct thread *td; struct proc *p; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_domain.dr_policy = td->td_cpuset->cs_domain; thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } /* * Create a new set that is a subset of a parent. 
*/ static struct domainset * domainset_shadow(const struct domainset *pdomain, const struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; ndomain = LIST_FIRST(freelist); LIST_REMOVE(ndomain, ds_link); /* * Initialize the key from the request. */ domainset_copy(domain, ndomain); /* * Restrict the key by the parent. */ DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); return _domainset_create(ndomain, freelist); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int augment_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (augment_mask) { CPU_AND(&newmask, &set->cs_mask, mask); } else CPU_COPY(mask, &newmask); if (CPU_EMPTY(&newmask)) return (EDEADLK); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, &set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail, * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets, including subordinate jails' * roots. */ if ((set->cs_flags & CPU_SET_ROOT) != 0 && jailed(curthread->td_ucred) && set == curthread->td_ucred->cr_prison->pr_cpuset) return (EPERM); /* * Verify that we have access to this set of * cpus. */ if ((set->cs_flags & (CPU_SET_ROOT | CPU_SET_RDONLY)) == CPU_SET_ROOT) { KASSERT(set->cs_parent != NULL, ("jail.cpuset=%d is not a proper child of parent jail's root.", set->cs_id)); /* * cpuset_getroot() cannot work here due to how top-level jail * roots are constructed. Top-level jails are parented to * thread0's cpuset (i.e. cpuset 1) rather than the system root. */ root = set->cs_parent; } else { root = cpuset_getroot(set); } mtx_lock_spin(&cpuset_lock); if (root && !CPU_SUBSET(&root->cs_mask, mask)) { error = EINVAL; goto out; } error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. 
*/ static int cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, struct domainset *orig, int *count, int augment_mask __unused) { struct cpuset *nset; struct domainset *domain; struct domainset newset; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); domain = set->cs_domain; domainset_copy(domain, &newset); if (!domainset_equal(domain, orig)) { if (!domainset_restrict(domain, dset)) return (EDEADLK); DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); /* Count the number of domains that are changing. */ (*count)++; } error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate_domain(nset, &newset, domain, count, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update_domain(struct cpuset *set, struct domainset *domain, struct domainset *orig, struct domainlist *domains) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); /* * If this domainset has changed from the parent we must calculate * a new set. Otherwise it simply inherits from the parent. When * we inherit from the parent we get a new mask and policy. If the * set is modified from the parent we keep the policy and only * update the mask. */ if (set->cs_domain != orig) { orig = set->cs_domain; set->cs_domain = domainset_shadow(domain, orig, domains); } else set->cs_domain = domain; LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update_domain(nset, set->cs_domain, orig, domains); return; } /* * Modify the set 'set' to use a copy the domainset provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify_domain(struct cpuset *set, struct domainset *domain) { struct domainlist domains; struct domainset temp; struct domainset *dset; struct cpuset *root; int ndomains, needed; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); domainset_freelist_init(&domains, 0); domain = domainset_create(domain); ndomains = 0; mtx_lock_spin(&cpuset_lock); for (;;) { root = cpuset_getroot(set); dset = root->cs_domain; /* * Verify that we have access to this set of domains. */ if (!domainset_valid(dset, domain)) { error = EINVAL; goto out; } /* * If applying prefer we keep the current set as the fallback. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER) DOMAINSET_COPY(&set->cs_domain->ds_mask, &domain->ds_mask); /* * Determine whether we can apply this set of domains and * how many new domain structures it will require. */ domainset_copy(domain, &temp); needed = 0; error = cpuset_testupdate_domain(set, &temp, set->cs_domain, &needed, 0); if (error) goto out; if (ndomains >= needed) break; /* Dropping the lock; we'll need to re-evaluate again. */ mtx_unlock_spin(&cpuset_lock); domainset_freelist_add(&domains, needed - ndomains); ndomains = needed; mtx_lock_spin(&cpuset_lock); } dset = set->cs_domain; cpuset_update_domain(set, domain, dset, &domains); out: mtx_unlock_spin(&cpuset_lock); domainset_freelist_free(&domains); if (error == 0) domainset_notify(); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. 
* * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. */ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } static int cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, const struct domainset *domain) { struct cpuset *parent; struct domainset *dset; parent = cpuset_getbase(set); /* * If we are restricting a cpu mask it must be a subset of the * parent or invalid CPUs have been specified. */ if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) return (EINVAL); /* * If we are restricting a domain mask it must be a subset of the * parent or invalid domains have been specified. */ dset = parent->cs_domain; if (domain != NULL && !domainset_valid(dset, domain)) return (EINVAL); return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. 
*/ static int cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, const cpuset_t *mask, const struct domainset *domain, struct setlist *cpusets, struct domainlist *domains) { struct cpuset *parent; struct cpuset *nset; struct domainset *dset; struct domainset *d; int error; error = cpuset_testshadow(set, mask, domain); if (error) return (error); parent = cpuset_getbase(set); dset = parent->cs_domain; if (mask == NULL) mask = &set->cs_mask; if (domain != NULL) d = domainset_shadow(dset, domain, domains); else d = set->cs_domain; nset = LIST_FIRST(cpusets); error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID); if (error == 0) { LIST_REMOVE(nset, cs_link); *nsetp = nset; } return (error); } static struct cpuset * cpuset_update_thread(struct thread *td, struct cpuset *nset) { struct cpuset *tdset; tdset = td->td_cpuset; td->td_cpuset = nset; td->td_domain.dr_policy = nset->cs_domain; sched_affinity(td); return (tdset); } static int cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_testshadow(parent, mask, domain); } static int cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_shadow(parent, nsetp, mask, domain, freelist, domainlist); } static int cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); /* * If the thread restricted its mask then apply that same * restriction to the new set, otherwise take it wholesale. */ if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { CPU_AND(mask, &tdset->cs_mask, &set->cs_mask); } else CPU_COPY(&set->cs_mask, mask); /* * If the thread restricted the domain then we apply the * restriction to the new set but retain the policy. */ if (tdset->cs_domain != parent->cs_domain) { domainset_copy(tdset->cs_domain, domain); DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); } else domainset_copy(set->cs_domain, domain); if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) return (EDEADLK); return (0); } static int cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) { struct domainset domain; cpuset_t mask; if (tdset->cs_id != CPUSET_INVALID) return (0); return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); } static int cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct domainset domain; cpuset_t mask; int error; /* * If we're replacing on a thread that has not constrained the * original set we can simply accept the new set. 
*/ if (tdset->cs_id != CPUSET_INVALID) { *nsetp = cpuset_ref(set); return (0); } error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); if (error) return (error); return cpuset_shadow(set, nsetp, &mask, &domain, freelist, domainlist); } static int cpuset_setproc_newbase(struct thread *td, struct cpuset *set, struct cpuset *nroot, struct cpuset **nsetp, struct setlist *cpusets, struct domainlist *domainlist) { struct domainset ndomain; cpuset_t nmask; struct cpuset *pbase; int error; pbase = cpuset_getbase(td->td_cpuset); /* Copy process mask, then further apply the new root mask. */ CPU_AND(&nmask, &pbase->cs_mask, &nroot->cs_mask); domainset_copy(pbase->cs_domain, &ndomain); DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask); /* Policy is too restrictive, will not work. */ if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask)) return (EDEADLK); /* * Remove pbase from the freelist in advance, it'll be pushed to * cpuset_ids on success. We assume here that cpuset_create() will not * touch pbase on failure, and we just enqueue it back to the freelist * to remain in a consistent state. */ pbase = LIST_FIRST(cpusets); LIST_REMOVE(pbase, cs_link); error = cpuset_create(&pbase, set, &nmask); if (error != 0) { LIST_INSERT_HEAD(cpusets, pbase, cs_link); return (error); } /* Duplicates some work from above... oh well. */ pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain, domainlist); *nsetp = pbase; return (0); } /* * Handle four cases for updating an entire process. * * 1) Set is non-null and the process is not rebasing onto a new root. This * reparents all anonymous sets to the provided set and replaces all * non-anonymous td_cpusets with the provided set. * 2) Set is non-null and the process is rebasing onto a new root. This * creates a new base set if the process previously had its own base set, * then reparents all anonymous sets either to that set or the provided set * if one was not created. Non-anonymous sets are similarly replaced. * 3) Mask is non-null. This replaces or creates anonymous sets for every * thread with the existing base as a parent. * 4) domain is non-null. This creates anonymous sets for every thread * and replaces the domain set. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, struct domainset *domain, bool rebase) { struct setlist freelist; struct setlist droplist; struct domainlist domainlist; struct cpuset *base, *nset, *nroot, *tdroot; struct thread *td; struct proc *p; int needed; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. 
*/ cpuset_freelist_init(&freelist, 1); domainset_freelist_init(&domainlist, 1); nfree = 1; LIST_INIT(&droplist); nfree = 0; base = set; nroot = NULL; if (set != NULL) nroot = cpuset_getroot(set); for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; tdroot = cpuset_getroot(td->td_cpuset); needed = p->p_numthreads; if (set != NULL && rebase && tdroot != nroot) needed++; if (nfree >= needed) break; PROC_UNLOCK(p); if (nfree < needed) { cpuset_freelist_add(&freelist, needed - nfree); domainset_freelist_add(&domainlist, needed - nfree); nfree = needed; } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * If we're changing roots and the root set is what has been specified * as the parent, then we'll check if the process was previously using * the root set and, if it wasn't, create a new base with the process's * mask applied to it. * * If the new root is incompatible with the existing mask, then we allow * the process to take on the new root if and only if they have * privilege to widen their mask anyways. Unprivileged processes get * rejected with EDEADLK. */ if (set != NULL && rebase && nroot != tdroot) { cpusetid_t base_id, root_id; root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id; base_id = cpuset_getbase(td->td_cpuset)->cs_id; if (base_id != root_id) { error = cpuset_setproc_newbase(td, set, nroot, &base, &freelist, &domainlist); if (error == EDEADLK && priv_check(td, PRIV_SCHED_CPUSET) == 0) error = 0; if (error != 0) goto unlock_out; } } /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_test_setthread(td->td_cpuset, base); else error = cpuset_setproc_test_maskthread(td->td_cpuset, mask, domain); thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. 
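 *
 * cpuset_rel_defer() only unlinks a set that has lost its last reference
 * and parks it on 'droplist'; the memory is handed back later, after the
 * proc lock is dropped, via cpuset_rel_complete() in the cleanup path.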
*/ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_setthread(td->td_cpuset, base, &nset, &freelist, &domainlist); else error = cpuset_setproc_maskthread(td->td_cpuset, mask, domain, &nset, &freelist, &domainlist); if (error) { thread_unlock(td); break; } cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: if (base != NULL && base != set) cpuset_rel(base); while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); cpuset_freelist_free(&freelist); domainset_freelist_free(&domainlist); return (error); } static int bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen) { size_t bytes; int i, once; char *p; once = 0; p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (once != 0) { if (bufsiz < 1) return (0); *p = ','; p++; bufsiz--; } else once = 1; if (bufsiz < sizeof(__STRING(ULONG_MAX))) return (0); bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]); p += bytes; bufsiz -= bytes; } return (p - buf); } static int bitset_strscan(struct bitset *set, int setlen, const char *buf) { int i, ret; const char *p; BIT_ZERO(setlen, set); p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (*p == ',') { p++; continue; } ret = sscanf(p, "%lx", &set->__bits[i]); if (ret == 0 || ret == -1) break; while (isxdigit(*p)) p++; } return (p - buf); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set, CPU_SETSIZE); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { char p; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)]; if (p != '\0') return (-1); return (0); } /* * Handle a domainset specifier in the sysctl tree. A poiner to a pointer to * a domainset is in arg1. If the user specifies a valid domainset the * pointer is updated. * * Format is: * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred */ int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS) { char buf[DOMAINSETBUFSIZ]; struct domainset *dset; struct domainset key; int policy, prefer, error; char *p; dset = *(struct domainset **)arg1; error = 0; if (dset != NULL) { p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ, (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE); sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer); } else sprintf(buf, ""); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); /* * Read in and validate the string. */ memset(&key, 0, sizeof(key)); p = &buf[bitset_strscan((struct bitset *)&key.ds_mask, DOMAINSET_SETSIZE, buf)]; if (p == buf) return (EINVAL); if (sscanf(p, ":%d:%d", &policy, &prefer) != 2) return (EINVAL); key.ds_policy = policy; key.ds_prefer = prefer; /* Domainset_create() validates the policy.*/ dset = domainset_create(&key); if (dset == NULL) return (EINVAL); *(struct domainset **)arg1 = dset; return (error); } /* * Apply an anonymous mask or a domain to a single thread. 
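 *
 * For reference, this per-thread case is what cpuset_setaffinity(2)
 * reaches with CPU_LEVEL_WHICH and CPU_WHICH_TID.  A minimal userspace
 * sketch (the helper name is illustrative; error handling trimmed):
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <err.h>
 *
 *	static void
 *	pin_self_to_cpu0(void)
 *	{
 *		cpuset_t mask;
 *
 *		CPU_ZERO(&mask);
 *		CPU_SET(0, &mask);
 *		// id -1 with CPU_WHICH_TID means the calling thread.
 *		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *		    sizeof(mask), &mask) != 0)
 *			err(1, "cpuset_setaffinity");
 *	}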
*/ static int _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { struct setlist cpusets; struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; cpuset_freelist_init(&cpusets, 1); domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, &cpusets, &domainlist); if (error == 0) set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: cpuset_freelist_free(&cpusets); domainset_freelist_free(&domainlist); return (error); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { return _cpuset_setthread(id, mask, NULL); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { cpuset_t mask; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); return _cpuset_setthread(id, &mask, NULL); } /* * Initialize static domainsets after NUMA information is available. This is * called before memory allocators are initialized. */ void domainset_init(void) { struct domainset *dset; int i; dset = &domainset_firsttouch; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_interleave; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_INTERLEAVE; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_roundrobin; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; dset->ds_prefer = -1; _domainset_create(dset, NULL); for (i = 0; i < vm_ndomains; i++) { dset = &domainset_fixed[i]; DOMAINSET_ZERO(&dset->ds_mask); DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; _domainset_create(dset, NULL); dset = &domainset_prefer[i]; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_PREFER; dset->ds_prefer = i; _domainset_create(dset, NULL); } } /* * Define the domainsets for cpuset 0, 1 and cpuset 2. */ void domainset_zero(void) { struct domainset *dset, *tmp; mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); domainset0 = &domainset_firsttouch; curthread->td_domain.dr_policy = domainset0; domainset2 = &domainset_interleave; kernel_object->domain.dr_policy = domainset2; /* Remove empty domains from the global policies. */ LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp) if (domainset_empty_vm(dset)) LIST_REMOVE(dset, ds_link); } /* * Creates system-wide cpusets and the cpuset for thread0 including three * sets: * * 0 - The root set which should represent all valid processors in the * system. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. * 2 - The kernel set which allows restriction and policy to be applied only * to kernel threads and the kernel_object. 
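 *
 * A small userspace sketch of inspecting these ids through
 * cpuset_getid(2); for an unjailed process that never changed sets this
 * typically reports the root (0) and default (1) sets described above
 * (illustrative only, error handling trimmed):
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_cpuset_ids(void)
 *	{
 *		cpusetid_t root, cur;
 *
 *		// id -1 means the current process.
 *		(void)cpuset_getid(CPU_LEVEL_ROOT, CPU_WHICH_PID, -1, &root);
 *		(void)cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, &cur);
 *		printf("root set %d, assigned set %d\n", (int)root, (int)cur);
 *	}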
*/ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int i; int error __unused; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* * Create the root system set (0) for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_COPY(&all_cpus, &set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); refcount_init(&set->cs_ref, 1); set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY; set->cs_domain = domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default (1), modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Create the kernel set (2). */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 2); KASSERT(error == 0, ("Error creating kernel set: %d\n", error)); set->cs_domain = domainset2; cpuset_kernel = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. */ cpuset_unr = new_unrhdr(3, INT_MAX, NULL); /* * If MD code has not initialized per-domain cpusets, place all * CPUs in domain 0. */ for (i = 0; i < MAXMEMDOM; i++) if (!CPU_EMPTY(&cpuset_domain[i])) goto domains_set; CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: return (cpuset_default); } void cpuset_kernthread(struct thread *td) { struct cpuset *set; thread_lock(td); set = td->td_cpuset; td->td_cpuset = cpuset_ref(cpuset_kernel); thread_unlock(td); cpuset_rel(set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. */ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); set = NULL; error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set->cs_flags |= CPU_SET_ROOT; *setp = set; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL, NULL, true); if (error) return (error); cpuset_rel(set); return (0); } /* * In Capability mode, the only accesses that are permitted are to the current * thread and process' CPU and domain sets. 
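 *
 * A sketch of what that restriction looks like from userspace, assuming
 * a Capsicum-enabled kernel: after cap_enter() the process may still
 * query itself (id -1), while naming any other pid is expected to fail
 * with ECAPMODE (pid 1 below is just an arbitrary other process):
 *
 *	#include <sys/param.h>
 *	#include <sys/capsicum.h>
 *	#include <sys/cpuset.h>
 *	#include <assert.h>
 *	#include <errno.h>
 *
 *	static void
 *	capmode_affinity_demo(void)
 *	{
 *		cpuset_t mask;
 *
 *		assert(cap_enter() == 0);
 *		// Own process: still allowed in capability mode.
 *		assert(cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
 *		    sizeof(mask), &mask) == 0);
 *		// Another process: rejected.
 *		assert(cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, 1,
 *		    sizeof(mask), &mask) == -1 && errno == ECAPMODE);
 *	}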
*/ static int cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id) { if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1 && !(which == CPU_WHICH_TID && id == td->td_tid) && !(which == CPU_WHICH_PID && id == td->td_proc->p_pid)) return (ECAPMODE); } return (0); } static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = copyin, .cpuset_copyout = copyout }; #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); set = NULL; error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets. */ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (user_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, ©_set)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *mask) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; int error; error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); error = cpuset_which(which, id, &p, &ttd, &set); 
if (error != 0) return (error); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) { if (cpusetsize < howmany(CPU_FLS(mask), NBBY)) return (ERANGE); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, cpusetsize); #endif } return (error); } int user_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; size_t size; int error; mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); size = min(cpusetsize, sizeof(cpuset_t)); error = kern_cpuset_getaffinity(td, level, which, id, size, mask); if (error == 0) { error = cb->cpuset_copyout(mask, maskp, size); if (error != 0) goto out; if (cpusetsize > size) { char *end; char *cp; int rv; end = cp = (char *)&maskp->__bits; end += cpusetsize; cp += size; while (cp != end) { rv = subyte(cp, 0); if (rv == -1) { error = EFAULT; goto out; } cp++; } } } out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (user_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, ©_set)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; int error; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, sizeof(cpuset_t)); #endif error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); if (CPU_EMPTY(mask)) return (EDEADLK); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch 
(which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask, NULL, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } return (error); } int user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; int error; size_t size; size = min(cpusetsize, sizeof(cpuset_t)); mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, size); if (error) goto out; /* * Verify that no high bits are set. */ if (cpusetsize > sizeof(cpuset_t)) { const char *end, *cp; int val; end = cp = (const char *)&maskp->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) { val = fubyte(cp); if (val == -1) { error = EFAULT; goto out; } if (val != 0) { error = EINVAL; goto out; } cp++; } } error = kern_cpuset_setaffinity(td, level, which, id, mask); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int *policy; }; #endif int sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) { return (kern_cpuset_getdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, ©_set)); } int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp, const struct cpuset_copy_cb *cb) { struct domainset outset; struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct domainset *dset; struct proc *p; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); bzero(&outset, sizeof(outset)); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); domainset_copy(nset->cs_domain, &outset); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); domainset_copy(ttd->td_cpuset->cs_domain, &outset); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); dset = ttd->td_cpuset->cs_domain; /* Show all domains in the proc. */ DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); /* Last policy wins. 
*/ outset.ds_policy = dset->ds_policy; outset.ds_prefer = dset->ds_prefer; thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: domainset_copy(set->cs_domain, &outset); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); /* * Translate prefer into a set containing only the preferred domain, * not the entire fallback set. */ if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { DOMAINSET_ZERO(&outset.ds_mask); DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); } DOMAINSET_COPY(&outset.ds_mask, mask); if (error == 0) error = cb->cpuset_copyout(mask, maskp, domainsetsize); if (error == 0) if (suword32(policyp, outset.ds_policy) != 0) error = EFAULT; out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int policy; }; #endif int sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) { return (kern_cpuset_setdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, ©_set)); } int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, const struct cpuset_copy_cb *cb) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; struct domainset domain; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); if (policy <= DOMAINSET_POLICY_INVALID || policy > DOMAINSET_POLICY_MAX) return (EINVAL); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); memset(&domain, 0, sizeof(domain)); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, domainsetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (domainsetsize > sizeof(domainset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += domainsetsize; cp += sizeof(domainset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } if (DOMAINSET_EMPTY(mask)) { error = EDEADLK; goto out; } DOMAINSET_COPY(mask, &domain.ds_mask); domain.ds_policy = policy; /* * Sanitize the provided mask. */ if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) { error = EINVAL; goto out; } /* Translate preferred policy into a mask and fallback. */ if (policy == DOMAINSET_POLICY_PREFER) { /* Only support a single preferred domain. */ if (DOMAINSET_COUNT(&domain.ds_mask) != 1) { error = EINVAL; goto out; } domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; /* This will be constrained by domainset_shadow(). */ DOMAINSET_COPY(&all_domains, &domain.ds_mask); } /* * When given an impossible policy, fall back to interleaving * across all domains. 
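 *
 * A minimal userspace sketch of the prefer policy via cpuset_setdomain(2),
 * assuming <sys/domainset.h> is available: exactly one domain is named
 * with DOMAINSET_POLICY_PREFER and the kernel supplies the fallback mask
 * itself (illustrative only, error handling trimmed):
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <sys/domainset.h>
 *	#include <err.h>
 *
 *	static void
 *	prefer_domain0(void)
 *	{
 *		domainset_t mask;
 *
 *		DOMAINSET_ZERO(&mask);
 *		DOMAINSET_SET(0, &mask);	// single preferred domain
 *		if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
 *		    sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
 *			err(1, "cpuset_setdomain");
 *	}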
*/ if (domainset_empty_vm(&domain)) domainset_copy(domainset2, &domain); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify_domain(nset, &domain); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = _cpuset_setthread(id, NULL, &domain); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, NULL, &domain, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify_domain(set, &domain); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB static void ddb_display_bitset(const struct bitset *set, int size) { int bit, once; for (once = 0, bit = 0; bit < size; bit++) { if (CPU_ISSET(bit, set)) { if (once == 0) { db_printf("%d", bit); once = 1; } else db_printf(",%d", bit); } } if (once == 0) db_printf(""); } void ddb_display_cpuset(const cpuset_t *set) { ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); } static void ddb_display_domainset(const domainset_t *set) { ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); } -DB_SHOW_COMMAND(cpusets, db_show_cpusets) +DB_SHOW_COMMAND_FLAGS(cpusets, db_show_cpusets, DB_CMD_MEMSAFE) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); db_printf(" domain policy %d prefer %d mask=", set->cs_domain->ds_policy, set->cs_domain->ds_prefer); ddb_display_domainset(&set->cs_domain->ds_mask); db_printf("\n"); if (db_pager_quit) break; } } -DB_SHOW_COMMAND(domainsets, db_show_domainsets) +DB_SHOW_COMMAND_FLAGS(domainsets, db_show_domainsets, DB_CMD_MEMSAFE) { struct domainset *set; LIST_FOREACH(set, &cpuset_domains, ds_link) { db_printf("set=%p policy %d prefer %d cnt %d\n", set, set->ds_policy, set->ds_prefer, set->ds_cnt); db_printf(" mask ="); ddb_display_domainset(&set->ds_mask); db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 163864ddf752..24ab81693e4c 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,5277 +1,5277 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; __read_mostly uma_zone_t pwd_zone; VFS_SMR_DECLARE; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit); static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags); static int fd_first_free(struct filedesc *fdp, int low, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); static struct pwd 
*pwd_alloc(void); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) #define FILEDESC_FOREACH_FDE(fdp, _iterator, _fde) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL) #define FILEDESC_FOREACH_FP(fdp, _iterator, _fp) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ static int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the last used fd. * * Call this variant if fdp can't be modified by anyone else (e.g, during exec). * Otherwise use fdlastfile. 
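 *
 * These descriptor-table lookups (fd_first_free() above and
 * fdlastfile_single() below) are plain bitmap scans built on
 * ffsl()/flsl(); a standalone sketch of the same idea over an array of
 * longs, with illustrative helper names:
 *
 *	#include <strings.h>	// ffsl(), flsl()
 *
 *	#define	WORDBITS	(sizeof(long) * 8)
 *
 *	// First clear bit in map[], or nbits if every bit is set.
 *	static int
 *	first_free_bit(const unsigned long *map, int nbits)
 *	{
 *		int i, words = (nbits + WORDBITS - 1) / WORDBITS;
 *
 *		for (i = 0; i < words; i++)
 *			if (map[i] != ~0UL)
 *				return (i * WORDBITS + ffsl((long)~map[i]) - 1);
 *		return (nbits);
 *	}
 *
 *	// Last set bit in map[], or -1 if the map is empty.
 *	static int
 *	last_used_bit(const unsigned long *map, int nbits)
 *	{
 *		int i = (nbits + WORDBITS - 1) / WORDBITS;
 *
 *		while (i-- > 0)
 *			if (map[i] != 0)
 *				return (i * WORDBITS + flsl((long)map[i]) - 1);
 *		return (-1);
 *	}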
*/ int fdlastfile_single(struct filedesc *fdp) { NDSLOTTYPE *map = fdp->fd_map; int off, minoff; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } int fdlastfile(struct filedesc *fdp) { FILEDESC_LOCK_ASSERT(fdp); return (fdlastfile_single(fdp)); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd == fdp->fd_freefile) fdp->fd_freefile++; } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif fdefree_last(fde); fdunused(fdp, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = getmaxfd(td); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. 
*/ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; struct mount *mp; struct kinfo_file *kif; int error, flg, kif_sz, seals, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; if (fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error != 0) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: flp = (struct flock *)arg; if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { error = EINVAL; break; } error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. 
This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp2); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_ADD_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; error = fo_add_seals(fp, arg); fdrop(fp, td); break; case F_GET_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fo_get_seals(fp, &seals) == 0) td->td_retval[0] = seals; else error = EINVAL; fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; if (vp->v_type != VREG) { fdrop(fp, td); error = ENOTTY; break; } /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; arg = MIN(arg, INT_MAX - bsize + 1); fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, (arg + bsize - 1) / bsize); atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp); fdrop(fp, td); break; case F_ISUNIONSTACK: /* * Check if the vnode is part of a union stack (either the * "union" flag from mount(2) or unionfs). * * Prior to introduction of this op libc's readdir would call * fstatfs(2), in effect unnecessarily copying kilobytes of * data just to check fs name and a mount flag. * * Fixing the code to handle everything in the kernel instead * is a non-trivial endeavor and has low priority, thus this * horrible kludge facilitates the current behavior in a much * cheaper manner until someone(tm) sorts this out. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Since we don't prevent dooming the vnode even non-null mp * found can become immediately stale. This is tolerable since * mount points are type-stable (providing safe memory access) * and any vfs op on this vnode going forward will return an * error (meaning return value in this case is meaningless). 
*/ mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { fdrop(fp, td); error = EBADF; break; } td->td_retval[0] = 0; if (mp->mnt_kern_flag & MNTK_UNIONFS || mp->mnt_flag & MNT_UNION) td->td_retval[0] = 1; fdrop(fp, td); break; case F_KINFO: #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; break; } #endif error = copyin((void *)arg, &kif_sz, sizeof(kif_sz)); if (error != 0) break; if (kif_sz != sizeof(*kif)) { error = EINVAL; break; } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO); FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL); if (error == 0 && fhold(fp)) { export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0); FILEDESC_SUNLOCK(fdp); fdrop(fp, td); if ((kif->kf_status & KF_ATTR_VALID) != 0) { kif->kf_structsize = sizeof(*kif); error = copyout(kif, (void *)arg, sizeof(*kif)); } else { error = EBADF; } } else { FILEDESC_SUNLOCK(fdp); if (error == 0) error = EBADF; } free(kif, M_TEMP); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp, *oldfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; oioctls = NULL; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_noref(fdp, old) == NULL) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } oldfde = &fdp->fd_ofiles[old]; oldfp = oldfde->fde_file; if (!fhold(oldfp)) goto unlock; /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); if (error != 0) { error = EMFILE; fdrop(oldfp, td); goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); /* Refetch oldfde because the table may have grown and old one freed. 
*/ oldfde = &fdp->fd_ofiles[old]; KASSERT(oldfp == oldfde->fde_file, ("fdt_ofiles shift from growth observed at fd %d", old)); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif oioctls = filecaps_free_prep(&newfde->fde_caps); fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, true, false); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } filecaps_free_finish(oioctls); return (error); } static void sigiofree(struct sigio *sigio) { crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } static struct sigio * funsetown_locked(struct sigio *sigio) { struct proc *p; struct pgrp *pg; SIGIO_ASSERT_LOCKED(); if (sigio == NULL) return (NULL); *sigio->sio_myref = NULL; if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { p = sigio->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } return (sigio); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; /* Racy check, consumers must provide synchronization. */ if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = funsetown_locked(*sigiop); SIGIO_UNLOCK(); if (sigio != NULL) sigiofree(sigio); } /* * Free a list of sigio structures. The caller must ensure that new sigio * structures cannot be added after this point. For process groups this is * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves * as an interlock. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio, *tmp; /* Racy check. */ sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; SIGIO_LOCK(); sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) { SIGIO_UNLOCK(); return; } /* * Every entry of the list should belong to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK(pg); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK(p); KASSERT((p->p_flag & P_WEXIT) != 0, ("%s: process %p is not exiting", __func__, p)); } SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { *sigio->sio_myref = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); } } if (pg != NULL) PGRP_UNLOCK(pg); else PROC_UNLOCK(p); SIGIO_UNLOCK(); SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) sigiofree(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. 
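 *
 * From userspace this path is reached through fcntl(2); a minimal sketch
 * that makes the calling process the owner of a descriptor and enables
 * async notification so it receives SIGIO (helper name illustrative,
 * error handling trimmed):
 *
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	static void
 *	request_sigio(int fd, void (*handler)(int))
 *	{
 *		signal(SIGIO, handler);
 *		// Deliver SIGIO to this process (FIOSETOWN under the hood).
 *		fcntl(fd, F_SETOWN, getpid());
 *		// O_ASYNC makes the descriptor generate SIGIO when ready.
 *		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC);
 *	}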
*/ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *osigio, *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; ret = 0; if (pgid > 0) { ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); if (ret == 0) { PROC_LOCK(proc); _PRELE(proc); if ((proc->p_flag & P_WEXIT) != 0) { ret = ESRCH; } else if (proc->p_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_proc = proc; SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); } PROC_UNLOCK(proc); } } else /* if (pgid < 0) */ { sx_slock(&proctree_lock); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; } else { if (pgrp->pg_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_pgrp = pgrp; SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); } PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); } if (ret == 0) *sigiop = sigio; SIGIO_UNLOCK(); if (osigio != NULL) sigiofree(osigio); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } static int closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (__predict_false(fp->f_type == DTYPE_MQUEUE)) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); #ifdef AUDIT if (AUDITING_TD(td) && audit) audit_sysclose(td, fd, fp); #endif error = closef(fp, td); /* * All paths leading up to closefp() will have already removed or * replaced the fd in the filedesc table, so a restart would not * operate on the same file. */ if (error == ERESTART) error = EINTR; return (error); } static int closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). 
*/ fdp->fd_holdleaderscount++; } else { holdleaders = false; } } error = closefp_impl(fdp, fd, fp, td, audit); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { FILEDESC_XLOCK_ASSERT(fdp); if (__predict_false(td->td_proc->p_fdtol != NULL)) { return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); } else { return (closefp_impl(fdp, fd, fp, td, audit)); } } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, true, true)); } static int close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; struct fdescenttbl *fdt; struct filedescent *fde; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (; fd <= highfd; fd++) { fde = &fdt->fdt_ofiles[fd]; if (fde->fde_file != NULL) fde->fde_flags |= UF_EXCLOSE; } out_locked: FILEDESC_XUNLOCK(fdp); return (0); } static int close_range_impl(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (fp == NULL) { if (fd == highfd) goto out_locked; } else { fdfree(fdp, fd); (void) closefp(fdp, fd, fp, td, true, true); if (fd == highfd) goto out_unlocked; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); } fd++; } out_locked: FILEDESC_XUNLOCK(fdp); out_unlocked: return (0); } int kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd) { /* * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 * open should not be a usage error. From a close_range() perspective, * close_range(3, ~0U, 0) in the same scenario should also likely not * be a usage error as all fd above 3 are in-fact already closed. */ if (highfd < lowfd) { return (EINVAL); } if ((flags & CLOSE_RANGE_CLOEXEC) != 0) return (close_range_cloexec(td, lowfd, highfd)); return (close_range_impl(td, lowfd, highfd)); } #ifndef _SYS_SYSPROTO_H_ struct close_range_args { u_int lowfd; u_int highfd; int flags; }; #endif int sys_close_range(struct thread *td, struct close_range_args *uap) { AUDIT_ARG_FD(uap->lowfd); AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0) return (EINVAL); return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd)); } #ifdef COMPAT_FREEBSD12 /* * Close open file descriptors. 
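 *
 * The modern interface for this is close_range(2), which the
 * compatibility wrapper below also funnels into; a small sketch that
 * either closes everything above the standard descriptors or, on kernels
 * with CLOSE_RANGE_CLOEXEC, merely marks it close-on-exec (helper name
 * illustrative):
 *
 *	#include <unistd.h>
 *
 *	static void
 *	drop_high_fds(int only_mark_cloexec)
 *	{
 *		if (only_mark_cloexec)
 *			(void)close_range(STDERR_FILENO + 1, ~0U,
 *			    CLOSE_RANGE_CLOEXEC);
 *		else
 *			(void)close_range(STDERR_FILENO + 1, ~0U, 0);
 *	}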
*/ #ifndef _SYS_SYSPROTO_H_ struct freebsd12_closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) { u_int lowfd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ lowfd = MAX(0, uap->lowfd); return (kern_close_range(td, 0, lowfd, ~0U)); } #endif /* COMPAT_FREEBSD12 */ #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error != 0) return (error); error = freebsd11_cvtnstat(&ub, &nub); if (error != 0) error = copyout(&nub, uap->sb, sizeof(nub)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. 
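 *
 * Illustrative example (not from the original source): a userlevel caller
 * typically issues
 *
 *	long pipe_buf = fpathconf(fd, _PC_PIPE_BUF);
 *
 * In kern_fpathconf() below, _PC_ASYNC_IO is answered for every descriptor
 * type; vnode-backed descriptors are handed to VOP_PATHCONF(); pipes and
 * sockets only honor _PC_PIPE_BUF (returning PIPE_BUF); other descriptor
 * types get EOPNOTSUPP.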
*/ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (__predict_true(src->fc_ioctls == NULL)) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (__predict_true(src->fc_ioctls == NULL)) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ static void filecaps_free_ioctl(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); fcaps->fc_ioctls = NULL; } void filecaps_free(struct filecaps *fcaps) { filecaps_free_ioctl(fcaps); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. 
*/ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); /* * open calls without WANTIOCTLCAPS free caps but leave the counter */ #if 0 KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); #endif KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Free the old file table when not shared by other threads or processes. * The old file table is considered to be shared when either are true: * - The process has more than one thread. * - The file descriptor table has been shared via fdshare(). * * When shared, the old file table will be placed on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { /* * Note we may be called here from fdinit while allocating a * table for a new process in which case ->p_fd points * elsewhere. 
*/ if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { free(otable, M_FILEDESC); } else { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (__predict_false(fd >= maxfd)) return (EMFILE); if (__predict_false(fd >= fdp->fd_nfiles)) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; MPASS(resultfp != NULL); MPASS(resultfd != NULL); error = _falloc_noinstall(td, &fp, 2); if (__predict_false(error != 0)) { return (error); } error = finstall_refed(td, fp, &fd, flags, fcaps); if (__predict_false(error != 0)) { falloc_abort(td, fp); return (error); } *resultfp = fp; *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. 
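 *
 * A minimal usage sketch (illustrative; falloc_noinstall() is assumed to
 * be the usual wrapper macro around _falloc_noinstall()): callers that
 * want to finish setting up the file object before publishing it in the
 * descriptor table do
 *
 *	error = falloc_noinstall(td, &fp);
 *	if (error != 0)
 *		return (error);
 *	(initialize fp, typically via finit(), described further below)
 *	error = finstall(td, fp, &fd, 0, NULL);
 *	fdrop(fp, td);
 *
 * The trailing fdrop() releases the caller's reference once the table,
 * via finstall(), holds its own.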
*/ int _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); MPASS(n > 0); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK); bzero(fp, sizeof(*fp)); refcount_init(&fp->f_count, n); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } void falloc_abort(struct thread *td, struct file *fp) { /* * For assertion purposes. */ refcount_init(&fp->f_count, 0); _fdrop(fp, td); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif } int finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); error = fdalloc(td, 0, fd); if (__predict_true(error == 0)) { _finstall(fdp, fp, *fd, flags, fcaps); } FILEDESC_XUNLOCK(fdp); return (error); } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { int error; MPASS(fd != NULL); if (!fhold(fp)) return (EBADF); error = finstall_refed(td, fp, fd, flags, fcaps); if (__predict_false(error != 0)) { fdrop(fp, td); } return (error); } /* * Build a new filedesc structure from another. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(void) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; return (newfdp); } /* * Build a pwddesc structure from another. * Copy the current, root, and jail root vnode references. * * If pdp is not NULL, return with it shared locked. */ struct pwddesc * pdinit(struct pwddesc *pdp, bool keeplock) { struct pwddesc *newpdp; struct pwd *newpwd; newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); PWDDESC_LOCK_INIT(newpdp); refcount_init(&newpdp->pd_refcount, 1); newpdp->pd_cmask = CMASK; if (pdp == NULL) { newpwd = pwd_alloc(); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); return (newpdp); } PWDDESC_XLOCK(pdp); newpwd = pwd_hold_pwddesc(pdp); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); if (!keeplock) PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Hold either filedesc or pwddesc of the passed process. 
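 *
 * An illustrative consumer (mirroring the sysctl code later in this file)
 * looks like:
 *
 *	PROC_LOCK(p);
 *	fdp = fdhold(p);
 *	PROC_UNLOCK(p);
 *	if (fdp != NULL) {
 *		FILEDESC_SLOCK(fdp);
 *		... inspect the table ...
 *		FILEDESC_SUNLOCK(fdp);
 *		fddrop(fdp);
 *	}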
* * The process lock is used to synchronize against the target exiting and * freeing the data. * * Clearing can be ilustrated in 3 steps: * 1. set the pointer to NULL. Either routine can race against it, hence * atomic_load_ptr. * 2. observe the process lock as not taken. Until then fdhold/pdhold can * race to either still see the pointer or find NULL. It is still safe to * grab a reference as clearing is stalled. * 3. after the lock is observed as not taken, any fdhold/pdhold calls are * guaranteed to see NULL, making it safe to finish clearing */ static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = atomic_load_ptr(&p->p_fd); if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static struct pwddesc * pdhold(struct proc *p) { struct pwddesc *pdp; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = atomic_load_ptr(&p->p_pd); if (pdp != NULL) refcount_acquire(&pdp->pd_refcount); return (pdp); } static void fddrop(struct filedesc *fdp) { if (refcount_load(&fdp->fd_holdcnt) > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } static void pddrop(struct pwddesc *pdp) { struct pwd *pwd; if (refcount_release_if_not_last(&pdp->pd_refcount)) return; PWDDESC_XLOCK(pdp); if (refcount_release(&pdp->pd_refcount) == 0) { PWDDESC_XUNLOCK(pdp); return; } pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_set(pdp, NULL); PWDDESC_XUNLOCK(pdp); pwd_drop(pwd); PWDDESC_LOCK_DESTROY(pdp); free(pdp, M_PWDDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Share a pwddesc structure. */ struct pwddesc * pdshare(struct pwddesc *pdp) { refcount_acquire(&pdp->pd_refcount); return (pdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Unshare a pwddesc structure. */ void pdunshare(struct thread *td) { struct pwddesc *pdp; struct proc *p; p = td->td_proc; /* Not shared. */ if (refcount_load(&p->p_pd->pd_refcount) == 1) return; pdp = pdcopy(p->p_pd); pdescfree(td); p->p_pd = pdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(); FILEDESC_SLOCK(fdp); for (;;) { lastfile = fdlastfile(fdp); if (lastfile < newfdp->fd_nfiles) break; FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, lastfile + 1); FILEDESC_SLOCK(fdp); } /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } MPASS(newfdp->fd_freefile != -1); FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copy a pwddesc structure. */ struct pwddesc * pdcopy(struct pwddesc *pdp) { struct pwddesc *newpdp; MPASS(pdp != NULL); newpdp = pdinit(pdp, true); newpdp->pd_cmask = pdp->pd_cmask; PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Clear POSIX style locks. 
This is only used when fdp loses a reference (i.e. * one of the processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type != DTYPE_VNODE || !fhold(fp)) continue; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; KASSERT(refcount_load(&fdp->fd_refcnt) == 0, ("%s: fd table %p carries references", __func__, fdp)); /* * Serialize with threads iterating over the table, if any. */ if (refcount_load(&fdp->fd_holdcnt) > 1) { FILEDESC_XLOCK(fdp); FILEDESC_XUNLOCK(fdp); } FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; fdefree_last(fde); (void) closef(fp, td); } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (RACCT_ENABLED()) racct_set_unlocked(p, RACCT_NOFILE, 0); #endif if (p->p_fdtol != NULL) fdclearlocks(td); /* * Check fdhold for an explanation. */ atomic_store_ptr(&p->p_fd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; fdescfree_fds(td, fdp); } void pdescfree(struct thread *td) { struct proc *p; struct pwddesc *pdp; p = td->td_proc; pdp = p->p_pd; MPASS(pdp != NULL); /* * Check pdhold for an explanation. */ atomic_store_ptr(&p->p_pd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); pddrop(pdp); } /* * For setugid programs, we don't want people to use that setugidness * to generate error messages which write to a file that would * otherwise be off-limits to the process.
We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; if (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE)) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; MPASS(td != NULL); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. 
This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop_close(fp, td)); } /* * Hack for file descriptor passing code. */ void closef_nothread(struct file *fp) { fdrop(fp, NULL); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. 
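 *
 * Illustrative sketch (DTYPE_FOO, foo_softc and foo_fileops are
 * hypothetical names, not from this file): a subsystem creating a new
 * file object typically calls
 *
 *	finit(fp, FREAD | FWRITE, DTYPE_FOO, foo_softc, &foo_fileops);
 *
 * after which another thread that observes the new f_ops pointer is also
 * guaranteed to see the flag, type and data stores made before it.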
*/ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } void finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) { fp->f_seqcount[UIO_READ] = 1; fp->f_seqcount[UIO_WRITE] = 1; finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, data, ops); } int fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); *fpp = NULL; fde = fdeget_noref(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde_inline(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } #ifdef CAPABILITIES int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; struct file *fp; seqc_t seq; *fpp = NULL; for (;;) { error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0 && !fhold(*fpp)) error = EBADF; FILEDESC_SUNLOCK(fdp); return (error); } #else int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { int error; error = fget_unlocked(td, fd, needrightsp, fpp); if (havecapsp != NULL && error == 0) filecaps_fill(havecapsp); return (error); } #endif #ifdef CAPABILITIES int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct filedescent *fde; const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; const cap_rights_t *haverights; cap_rights_t rights; seqc_t seq; VFS_SMR_ASSERT_ENTERED(); rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); if (__predict_false(cap_check_inline_transient(haverights, &rights))) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL)) { return (EAGAIN); } if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) return (EAGAIN); /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. * * Not yet supported by fast path. 
*/ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { #ifdef notyet ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; #else return (EAGAIN); #endif } *vpp = vp; return (0); } #else int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; VFS_SMR_ASSERT_ENTERED(); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) return (EAGAIN); filecaps_fill(&ndp->ni_filecaps); *vpp = vp; return (0); } #endif int fgetvp_lookup(int fd, struct nameidata *ndp, struct vnode **vpp) { struct thread *td; struct file *fp; struct vnode *vp; struct componentname *cnp; cap_rights_t rights; int error; td = curthread; rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); cnp = &ndp->ni_cnd; error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { error = EBADF; goto out_free; } vp = fp->f_vnode; if (__predict_false(vp == NULL)) { error = ENOTDIR; goto out_free; } vrefact(vp); /* * XXX does not check for VDIR, handled by namei_setup */ if ((fp->f_flag & FSEARCH) != 0) cnp->cn_flags |= NOEXECCHECK; fdrop(fp, td); #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; ndp->ni_resflags |= NIRES_STRICTREL; } #endif /* * TODO: avoid copying ioctl caps if it can be helped to begin with */ if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) filecaps_free_ioctl(&ndp->ni_filecaps); *vpp = vp; return (0); out_free: filecaps_free(&ndp->ni_filecaps); fdrop(fp, td); return (error); } /* * Fetch the descriptor locklessly. * * We avoid fdrop() races by never raising a refcount above 0. To accomplish * this we have to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain that the * identity is still correct and we did not lose a race due to preemption. * * Force a reload of fdt when looping. Another thread could reallocate * the table before this fd was closed, so it is possible that there is * a stale fp pointer in cached version. 
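 *
 * In outline (an illustrative paraphrase of the CAPABILITIES variant
 * below):
 *
 *	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 *	fp = fdt->fdt_ofiles[fd].fde_file;	(and snapshot its rights)
 *	if (!refcount_acquire_if_not_zero(&fp->f_count))
 *		reload fdt and retry;
 *	atomic_thread_fence_acq();
 *	fdt = fdp->fd_files;
 *	if (!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))
 *		fdrop(fp, td) and retry;
 *
 * The non-CAPABILITIES variant replaces the seqc check with a simple
 * re-read of the fde_file pointer.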
*/ #ifdef CAPABILITIES static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp) { struct filedesc *fdp; const struct filedescent *fde; const struct fdescenttbl *fdt; struct file *fp; seqc_t seq; cap_rights_t haverights; int error; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (EBADF); fdt = atomic_load_ptr(&fdp->fd_files); continue; } error = cap_check_inline(&haverights, needrightsp); if (__predict_false(error != 0)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (error); fdt = atomic_load_ptr(&fdp->fd_files); continue; } if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) break; fdrop(fp, td); } *fpp = fp; if (seqp != NULL) { *seqp = seq; } return (0); } #else static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp __unused) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) break; fdrop(fp, td); } *fpp = fp; return (0); } #endif /* * See the comments in fget_unlocked_seq for an explanation of how this works. * * This is a simplified variant which bails out to the aforementioned routine * if anything goes wrong. In practice this only happens when userspace is * racing with itself. */ int fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct filedesc *fdp; #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; const cap_rights_t *haverights; #endif fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { *fpp = NULL; return (EBADF); } #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (__predict_false(fp == NULL)) goto out_fallback; #ifdef CAPABILITIES if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) goto out_fallback; #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) goto out_fallback; /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. 
*/ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) #else if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) #endif goto out_fdrop; *fpp = fp; return (0); out_fdrop: fdrop(fp, td); out_fallback: *fpp = NULL; return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL)); } /* * Translate fd -> file when the caller guarantees the file descriptor table * can't be changed by others. * * Note this does not mean the file object itself is only visible to the caller, * merely that it wont disappear without having to be referenced. * * Must be paired with fput_only_user. */ #ifdef CAPABILITIES int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { const struct filedescent *fde; const struct fdescenttbl *fdt; const cap_rights_t *haverights; struct file *fp; int error; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fdt = fdp->fd_files; fde = &fdt->fdt_ofiles[fd]; fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); haverights = cap_rights_fde_inline(fde); error = cap_check_inline(haverights, needrightsp); if (__predict_false(error != 0)) return (error); *fpp = fp; return (0); } #else int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct file *fp; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fp = fdp->fd_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); *fpp = fp; return (0); } #endif /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp) { struct file *fp; int error; *fpp = NULL; error = fget_unlocked(td, fd, needrightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. 
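 *
 * For example (illustrative, not from this file), a write path doing
 *
 *	error = fget_write(td, fd, &cap_write_rights, &fp);
 *
 * on a descriptor opened O_RDONLY gets EBADF here, not EINVAL, because
 * FWRITE is missing from fp->f_flag.  (cap_write_rights is assumed to be
 * the usual preinitialized rights set.)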
*/ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if (fp->f_ops != &path_fileops && ((fp->f_flag & (FREAD | FEXEC)) == 0 || (fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; return (error); #else cap_rights_t fdrights; struct filedesc *fdp; struct file *fp; seqc_t seq; *fpp = NULL; fdp = td->td_proc->p_fd; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } if (maxprotp != NULL) fdrights = *cap_rights(fdp, fd); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&fdrights); *fpp = fp; return (0); #endif } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { #ifndef CAPABILITIES return (fget_unlocked(td, fd, rightsp, fpp)); #else struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; int error; seqc_t seq; *fpp = NULL; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? 
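 *
 * A typical caller (illustrative sketch; the rights argument depends on
 * the operation) pairs the lookup with vrele():
 *
 *	error = fgetvp(td, fd, &cap_fstat_rights, &vp);
 *	if (error != 0)
 *		return (error);
 *	... operate on vp ...
 *	vrele(vp);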
*/ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filecaps caps; struct file *fp; int error; error = fget_cap(td, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); fdrop(fp, td); return (0); out: filecaps_free(&caps); fdrop(fp, td); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. * * Without the noinline attribute clang keeps inlining the func thorough this * file when fdrop is used. */ int __noinline _fdrop(struct file *fp, struct thread *td) { int error; #ifdef INVARIANTS int count; count = refcount_load(&fp->f_count); if (count != 0) panic("fdrop: fp %p count %d", fp, count); #endif error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); error = EOPNOTSUPP; if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { goto done; } if (fp->f_ops == &path_fileops) { goto done; } error = 0; vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. 
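 *
 * Context (an addition, not part of the original comment): this is
 * believed to be the back end of the fdescfs /dev/fd mechanism, where an
 * open that fails with ENODEV or ENXIO and a saved dup-fd is retried
 * here, so that, e.g.,
 *
 *	fd = open("/dev/fd/0", O_RDONLY);
 *
 * yields a new descriptor referencing the same open file as descriptor 0.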
*/ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } if (!fhold(fp)) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EBADF); } newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seqc_write_begin(&oldfde->fde_seqc); seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); seqc_write_end(&oldfde->fde_seqc); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. 
*/ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int i; FILEDESC_LOCK_ASSERT(fdp); FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } static void pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) { if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { vrefact(oldpwd->pwd_cdir); newpwd->pwd_cdir = oldpwd->pwd_cdir; } if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { vrefact(oldpwd->pwd_rdir); newpwd->pwd_rdir = oldpwd->pwd_rdir; } if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { vrefact(oldpwd->pwd_jdir); newpwd->pwd_jdir = oldpwd->pwd_jdir; } } struct pwd * pwd_hold_pwddesc(struct pwddesc *pdp) { struct pwd *pwd; PWDDESC_ASSERT_XLOCKED(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (pwd != NULL) refcount_acquire(&pwd->pwd_refcount); return (pwd); } bool pwd_hold_smr(struct pwd *pwd) { MPASS(pwd != NULL); if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { return (true); } return (false); } struct pwd * pwd_hold(struct thread *td) { struct pwddesc *pdp; struct pwd *pwd; pdp = td->td_proc->p_pd; vfs_smr_enter(); pwd = vfs_smr_entered_load(&pdp->pd_pwd); if (pwd_hold_smr(pwd)) { vfs_smr_exit(); return (pwd); } vfs_smr_exit(); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); return (pwd); } struct pwd * pwd_hold_proc(struct proc *p) { struct pwddesc *pdp; struct pwd *pwd; PROC_ASSERT_HELD(p); PROC_LOCK(p); pdp = pdhold(p); MPASS(pdp != NULL); PROC_UNLOCK(p); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); pddrop(pdp); return (pwd); } static struct pwd * pwd_alloc(void) { struct pwd *pwd; pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); bzero(pwd, sizeof(*pwd)); refcount_init(&pwd->pwd_refcount, 1); return (pwd); } void pwd_drop(struct pwd *pwd) { if (!refcount_release(&pwd->pwd_refcount)) return; if (pwd->pwd_cdir != NULL) vrele(pwd->pwd_cdir); if (pwd->pwd_rdir != NULL) vrele(pwd->pwd_rdir); if (pwd->pwd_jdir != NULL) vrele(pwd->pwd_jdir); uma_zfree_smr(pwd_zone, pwd); } /* * The caller is responsible for invoking priv_check() and * mac_vnode_check_chroot() to authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && oldpwd->pwd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } } else { FILEDESC_SUNLOCK(fdp); } vrefact(vp); newpwd->pwd_rdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; VNPASS(vp->v_usecount > 0, vp); newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); newpwd->pwd_cdir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * jail_attach(2) changes both root and working directories. 
*/ int pwd_chroot_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_cdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_ensure_dirs(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) { PWDDESC_XUNLOCK(pdp); return; } PWDDESC_XUNLOCK(pdp); newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_fill(oldpwd, newpwd); if (newpwd->pwd_cdir == NULL) { vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; } if (newpwd->pwd_rdir == NULL) { vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; } pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } void pwd_set_rootvnode(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. 
*/ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; newpwd = pwd_alloc(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) continue; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd == NULL || (oldpwd->pwd_cdir != olddp && oldpwd->pwd_rdir != olddp && oldpwd->pwd_jdir != olddp)) { PWDDESC_XUNLOCK(pdp); pddrop(pdp); continue; } if (oldpwd->pwd_cdir == olddp) { vrefact(newdp); newpwd->pwd_cdir = newdp; } if (oldpwd->pwd_rdir == olddp) { vrefact(newdp); newpwd->pwd_rdir = newdp; } if (oldpwd->pwd_jdir == olddp) { vrefact(newdp); newpwd->pwd_jdir = newdp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); pddrop(pdp); newpwd = pwd_alloc(); } sx_sunlock(&allproc_lock); pwd_drop(newpwd); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { NDSLOTTYPE *map; struct filedesc *fdp; u_int namelen; int count, off, minoff; namelen = arg2; if (namelen != 1) return (EINVAL); if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); map = fdp->fd_map; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) count += bitcountl(map[off]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. 
*/ n += fdp->fd_nfiles; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto nextproc; FILEDESC_FOREACH_FP(fdp, n, fp) { xf.xf_fd = n; xf.xf_file = (uintptr_t)fp; xf.xf_data = (uintptr_t)fp->f_data; xf.xf_vnode = (uintptr_t)fp->f_vnode; xf.xf_type = (uintptr_t)fp->f_type; xf.xf_count = refcount_load(&fp->f_count); xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); /* * There is no need to re-check the fdtable refcount * here since the filedesc lock is not dropped in the * loop body. */ if (error != 0) break; } nextproc: FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = refcount_load(&fp->f_count); kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. 
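pack_kinfo() above shrinks the advertised size of a record to the bytes actually used by kf_path, rounded to an 8-byte boundary, so that a packed buffer holds far more entries than sizeof(struct kinfo_file) each would allow. Here is a small sketch of the same trimming against a hypothetical struct krec; the struct, pack_krec() and ROUNDUP_EX() are invented names used only to illustrate the idea.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy record with a large trailing path buffer, like struct kinfo_file. */
struct krec {
        int     size;           /* bytes that are actually meaningful */
        int     fd;
        char    path[1024];
};

#define ROUNDUP_EX(x, y)        ((((x) + ((y) - 1)) / (y)) * (y))

/*
 * Trim the advertised record size to the used part of the path plus its
 * terminating NUL, rounded to an 8-byte boundary.  A consumer then advances
 * by rec->size rather than sizeof(struct krec).
 */
static void
pack_krec(struct krec *rec)
{
        rec->size = offsetof(struct krec, path) + strlen(rec->path) + 1;
        rec->size = ROUNDUP_EX(rec->size, (int)sizeof(uint64_t));
}

int
main(void)
{
        struct krec rec = { .fd = 4 };

        strcpy(rec.path, "/etc/rc.conf");
        pack_krec(&rec);
        printf("full %zu bytes, packed %d bytes\n", sizeof(rec), rec.size);
        return (0);
}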
*/ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct pwddesc *pdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) return (ENOMEM); efbuf->remainder -= kif->kf_structsize; } if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) return (sbuf_error(efbuf->sb)); return (0); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); if (efbuf->pdp != NULL) PWDDESC_XUNLOCK(efbuf->pdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->pdp != NULL) PWDDESC_XLOCK(efbuf->pdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct pwddesc *pdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; struct pwd *pwd; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = ktr_get_tracevp(p, true); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. 
*/ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); pdp = pdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; error = 0; if (tracevp != NULL) error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (error == 0 && textvp != NULL) error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (error == 0 && cttyvp != NULL) error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); if (error != 0 || pdp == NULL || fdp == NULL) goto fail; efbuf->fdp = fdp; efbuf->pdp = pdp; PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { /* working directory */ if (pwd->pwd_cdir != NULL) { vrefact(pwd->pwd_cdir); error = export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (error == 0 && pwd->pwd_rdir != NULL) { vrefact(pwd->pwd_rdir); error = export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (error == 0 && pwd->pwd_jdir != NULL) { vrefact(pwd->pwd_jdir); error = export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } } PWDDESC_XUNLOCK(pdp); if (error != 0) goto fail; if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fail: if (fdp != NULL) fddrop(fdp); if (pdp != NULL) pddrop(pdp); free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? 
error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) { int error; vrefact(vp); PWDDESC_XUNLOCK(pdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); PWDDESC_XLOCK(pdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; struct pwddesc *pdp; struct pwd *pwd; u_int namelen; int error, i, *name; struct file *fp; struct proc *p; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); if (fdp != NULL) pdp = pdhold(p); PROC_UNLOCK(p); if (fdp == NULL || pdp == NULL) { if (fdp != NULL) fddrop(fdp); return (ENOENT); } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { if (pwd->pwd_cdir != NULL) export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, okif, pdp, req); if (pwd->pwd_rdir != NULL) export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, okif, pdp, req); if (pwd->pwd_jdir != NULL) export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, okif, pdp, req); } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fddrop(fdp); pddrop(pdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, 
KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct pwddesc *pdp; struct pwd *pwd; struct export_fd_buf *efbuf; struct vnode *cdir; int error; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = pdp; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = 0; PWDDESC_XLOCK(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); cdir = pwd->pwd_cdir; if (cdir == NULL) { error = EINVAL; } else { vrefact(cdir); error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } PWDDESC_XUNLOCK(pdp); pddrop(pdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_EVENTFD: return ("eventfd"); case DTYPE_LINUXTFD: return ("ltimer"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. 
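vntype_to_kinfo() above bridges the in-kernel vnode type enumeration and the exported KF_VTYPE_* constants with a table-driven linear scan that falls back to KF_VTYPE_UNKNOWN. The sketch below shows the same translation shape with two toy enumerations; every name in it is invented for illustration and none of it is kernel source.

#include <stdio.h>

/* Two invented enum spaces standing in for enum vtype and KF_VTYPE_*. */
enum vtype_ex { V_NON, V_REG, V_DIR, V_LNK };
enum ktype_ex { K_UNKNOWN = 0, K_NON = 1, K_REG = 2, K_DIR = 3, K_LNK = 4 };

/*
 * Table-driven translation: a linear scan over explicit pairs with an
 * "unknown" fallback, which keeps the mapping correct even if either
 * enumeration is renumbered or extended.
 */
static enum ktype_ex
vtype_to_ktype(enum vtype_ex vtype)
{
        static const struct {
                enum vtype_ex   vtype;
                enum ktype_ex   ktype;
        } table[] = {
                { V_NON, K_NON }, { V_REG, K_REG },
                { V_DIR, K_DIR }, { V_LNK, K_LNK },
        };
        unsigned int i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                if (table[i].vtype == vtype)
                        return (table[i].ktype);
        return (K_UNKNOWN);
}

int
main(void)
{
        printf("%d\n", vtype_to_ktype(V_DIR));  /* prints 3 */
        return (0);
}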
*/ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } -DB_SHOW_COMMAND(files, db_show_files) +DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); /* * XXXMJG this is a temporary hack due to boot ordering issues against * the vnode zone. 
*/ vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; static int path_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (POLLNVAL); } static int path_close(struct file *fp, struct thread *td) { MPASS(fp->f_type == DTYPE_VNODE); fp->f_ops = &badfileops; vrele(fp->f_vnode); return (0); } struct fileops path_fileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = path_poll, .fo_kqfilter = vn_kqfilter_opath, .fo_stat = vn_statfile, .fo_close = path_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = vn_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return 
(EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 3c4f9d0eac23..ead08b530dc3 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -1,1703 +1,1703 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. */ int it_waiting; /* Waiting in the runq. */ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 0; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static int intr_epoch_batch = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_epoch_batch, CTLFLAG_RWTUN, &intr_epoch_batch, 0, "Maximum interrupt handler executions without re-entering epoch(9)"); #ifdef HWPMC_HOOKS static int intr_hwpmc_waiting_report_threshold = 1; SYSCTL_INT(_hw, OID_AUTO, intr_hwpmc_waiting_report_threshold, CTLFLAG_RWTUN, &intr_hwpmc_waiting_report_threshold, 1, "Threshold for reporting number of events in a workq"); #define PMC_HOOK_INSTALLED_ANY() __predict_false(pmc_hook != NULL) #endif static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); static int intr_event_schedule_thread(struct intr_event *ie, struct trapframe *frame); static struct intr_thread *ithread_create(const char *name); static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , intr, all); PMC_SOFT_DEFINE( , , intr, ithread); PMC_SOFT_DEFINE( , , intr, filter); PMC_SOFT_DEFINE( , , intr, stray); PMC_SOFT_DEFINE( , , intr, schedule); PMC_SOFT_DEFINE( , , intr, waiting); #define PMC_SOFT_CALL_INTR_HLPR(event, frame) \ do { \ if (frame != NULL) \ PMC_SOFT_CALL_TF( , , intr, event, frame); \ else \ PMC_SOFT_CALL( , , intr, event); \ } while (0) #endif /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. 
*/ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; mtx_assert(&ie->ie_lock, MA_OWNED); /* Determine the overall priority of this event. */ if (CK_SLIST_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = CK_SLIST_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. */ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_ithread_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space, flags; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); flags = 0; missed = 0; space = 1; /* Run through all the handlers updating values. */ CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; flags |= ih->ih_flags; } ie->ie_hflags = flags; /* * If there is only one handler and its name is too long, just copy in * as much of the end of the name (includes the unit number) as will * fit. Otherwise, we have multiple handlers and not all of the names * will fit. Add +'s to indicate missing names. If we run out of room * and still have +'s to add, change the last character from a + to a *. */ if (missed == 1 && space == 1) { ih = CK_SLIST_FIRST(&ie->ie_handlers); missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 - sizeof(ie->ie_fullname); strcat(ie->ie_fullname, (missed == 0) ? " " : "-"); strcat(ie->ie_fullname, &ih->ih_name[missed]); missed = 0; } last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update it's priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. 
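The long comment in intr_event_update() above describes how handler names are folded into ie_fullname: names that fit are appended, each name that does not fit contributes a '+', and when even the '+' markers no longer fit the last character becomes a '*'. The standalone sketch below compresses that idea (it deliberately ignores the single-handler truncation and spacing refinements of the real code); build_fullname() is an invented name, and the example assumes a libc that provides strlcat(3), as FreeBSD's does.

#include <stdio.h>
#include <string.h>

#define FULLNAME_LEN    32      /* stand-in for sizeof(ie_fullname) */

/*
 * Fold handler names into an event name: names that fit are appended after
 * a space, each name that does not fit contributes a '+', and once even the
 * '+' markers run out of room the last character is turned into a '*'.
 */
static void
build_fullname(char *full, size_t len, const char *event,
    const char **names, int n)
{
        int i, missed;

        missed = 0;
        snprintf(full, len, "%s", event);
        for (i = 0; i < n; i++) {
                if (strlen(full) + strlen(names[i]) + 1 < len) {
                        strlcat(full, " ", len);
                        strlcat(full, names[i], len);
                } else
                        missed++;
        }
        while (missed-- > 0) {
                if (strlen(full) + 1 == len) {
                        if (full[len - 2] == '+') {
                                full[len - 2] = '*';
                                break;
                        }
                        full[len - 2] = '+';
                } else
                        strlcat(full, "+", len);
        }
}

int
main(void)
{
        const char *names[] = { "em0", "em1", "ahci0", "xhci0", "uart0" };
        char full[FULLNAME_LEN];

        build_fullname(full, sizeof(full), "irq16:", names, 5);
        printf("%s\n", full);   /* "irq16: em0 em1 ahci0 xhci0+" */
        return (0);
}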
*/ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; CK_SLIST_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. Using a cpu id of NOCPU unbinds * the interrupt event. */ static int _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc. */ if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); } if (bindirq) error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); } return (error); } if (bindirq) { mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); } return (error); } /* * Bind an interrupt event to the specified CPU. For supported platforms, any * associated ithreads as well as the primary interrupt context will be bound * to the specificed CPU. */ int intr_event_bind(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, true)); } /* * Bind an interrupt event to the specified CPU, but do not bind associated * ithreads. */ int intr_event_bind_irqonly(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, false)); } /* * Bind an interrupt event's ithread to the specified CPU. */ int intr_event_bind_ithread(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, false, true)); } /* * Bind an interrupt event's ithread to the specified cpuset. 
*/ int intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) { lwpid_t id; mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); return (cpuset_setthread(id, cs)); } else { mtx_unlock(&ie->ie_lock); } return (ENODEV); } static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && CK_SLIST_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, int mode, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set. */ if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); switch (mode) { case CPU_WHICH_IRQ: return (intr_event_bind(ie, cpu)); case CPU_WHICH_INTRHANDLER: return (intr_event_bind_irqonly(ie, cpu)); case CPU_WHICH_ITHREAD: return (intr_event_bind_ithread(ie, cpu)); default: return (EINVAL); } } int intr_getaffinity(int irq, int mode, void *m) { struct intr_event *ie; struct thread *td; struct proc *p; cpuset_t *mask; lwpid_t id; int error; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); error = 0; CPU_ZERO(mask); switch (mode) { case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); break; case CPU_WHICH_ITHREAD: mtx_lock(&ie->ie_lock); if (ie->ie_thread == NULL) { mtx_unlock(&ie->ie_lock); CPU_COPY(cpuset_root, mask); } else { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL); if (error != 0) return (error); CPU_COPY(&td->td_cpuset->cs_mask, mask); PROC_UNLOCK(p); } default: return (EINVAL); } return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_wakeup(td, SRQ_INTR); } else thread_unlock(td); } int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t 
handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_handler **prevptr; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; if (flags & INTR_TYPE_NET) ih->ih_flags |= IH_NET; /* We can only have one exclusive handler in a event. */ mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (CK_SLIST_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ CK_SLIST_FOREACH_PREVPTR(temp_ih, prevptr, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } CK_SLIST_INSERT_PREVPTR(prevptr, temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. 
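intr_event_describe_handler() above appends ":descr" to a handler name by locating either an existing ':' or the end of the string, checking that the colon plus the description plus the trailing NUL still fit, and writing them in place. Below is a standalone sketch of that bookkeeping only; describe() is an invented name, and returning ENOSPC simply mirrors the kernel's error choice.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Append ":descr" to a handler-style name: reuse an existing ':' if there
 * is one (replacing the old description), otherwise start at the end of
 * the name, and refuse when the result would not fit.
 */
static int
describe(char *name, size_t namelen, const char *descr)
{
        char *start;
        size_t space;

        start = strchr(name, ':');
        if (start == NULL)
                start = strchr(name, '\0');
        space = namelen - (start - name) - 1;   /* "- 1" leaves room for NUL */
        if (strlen(descr) + 1 > space)          /* "+ 1" accounts for ':' */
                return (ENOSPC);
        *start = ':';
        strcpy(start + 1, descr);
        return (0);
}

int
main(void)
{
        char name[16] = "em0";

        if (describe(name, sizeof(name), "rxq0") == 0)
                printf("%s\n", name);   /* prints "em0:rxq0" */
        return (0);
}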
*/ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * If intr_event_handle() is running in the ISR context at the time of the call, * then wait for it to complete. */ static void intr_event_barrier(struct intr_event *ie) { int phase; mtx_assert(&ie->ie_lock, MA_OWNED); phase = ie->ie_phase; /* * Switch phase to direct future interrupts to the other active counter. * Make sure that any preceding stores are visible before the switch. */ KASSERT(ie->ie_active[!phase] == 0, ("idle phase has activity")); atomic_store_rel_int(&ie->ie_phase, !phase); /* * This code cooperates with wait-free iteration of ie_handlers * in intr_event_handle. * Make sure that the removal and the phase update are not reordered * with the active count check. * Note that no combination of acquire and release fences can provide * that guarantee as Store->Load sequences can always be reordered. */ atomic_thread_fence_seq_cst(); /* * Now wait on the inactive phase. * The acquire fence is needed so that that all post-barrier accesses * are after the check. */ while (ie->ie_active[phase] > 0) cpu_spinwait(); atomic_thread_fence_acq(); } static void intr_handler_barrier(struct intr_handler *handler) { struct intr_event *ie; ie = handler->ih_event; mtx_assert(&ie->ie_lock, MA_OWNED); KASSERT((handler->ih_flags & IH_DEAD) == 0, ("update for a removed handler")); if (ie->ie_thread == NULL) { intr_event_barrier(ie); return; } if ((handler->ih_flags & IH_CHANGED) == 0) { handler->ih_flags |= IH_CHANGED; intr_event_schedule_thread(ie, NULL); } while ((handler->ih_flags & IH_CHANGED) != 0) msleep(handler, &ie->ie_lock, 0, "ih_barr", 0); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for LinuxKPI drivers only. * Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. */ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_handler *ih; struct intr_handler **prevptr; #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); CK_SLIST_FOREACH_PREVPTR(ih, prevptr, &ie->ie_handlers, ih_next) { if (ih == handler) break; } if (ih == NULL) { panic("interrupt handler \"%s\" not found in " "interrupt event \"%s\"", handler->ih_name, ie->ie_name); } /* * If there is no ithread, then directly remove the handler. 
Note that * intr_event_handle() iterates ie_handlers in a lock-less fashion, so * care needs to be taken to keep ie_handlers consistent and to free * the removed handler only when ie_handlers is quiescent. */ if (ie->ie_thread == NULL) { CK_SLIST_REMOVE_PREVPTR(prevptr, ih, ih_next); intr_event_barrier(ie); intr_event_update(ie); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * Let the interrupt thread do the job. * The interrupt source is disabled when the interrupt thread is * running, so it does not have to worry about interaction with * intr_event_handle(). */ KASSERT((handler->ih_flags & IH_DEAD) == 0, ("duplicate handle remove")); handler->ih_flags |= IH_DEAD; intr_event_schedule_thread(ie, NULL); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } int intr_event_suspend_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); handler->ih_flags |= IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } int intr_event_resume_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); /* * intr_handler_barrier() acts not only as a barrier, * it also allows to check for any pending interrupts. */ mtx_lock(&ie->ie_lock); handler->ih_flags &= ~IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } static int intr_event_schedule_thread(struct intr_event *ie, struct trapframe *frame) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_hflags & IH_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_INTERRUPT); } KASSERT(td->td_proc != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. 
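The ordering described in the comment above, a release store to it_need in the scheduling path paired with an acquire load/cmpset in the ithread, is the classic publish/claim handshake. The userspace sketch below reproduces only that handshake with C11 atomics and a pthread; 'need' and 'payload' are invented stand-ins, the busy-wait is for brevity, and none of this is the kernel mechanism itself. Build with -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int need;
static int payload;

static void *
consumer(void *arg)
{
        int expected;

        (void)arg;
        for (;;) {
                expected = 1;
                if (atomic_compare_exchange_strong_explicit(&need, &expected,
                    0, memory_order_acquire, memory_order_relaxed)) {
                        /* The acquire above orders this read after the claim. */
                        printf("serviced payload %d\n", payload);
                        return (NULL);
                }
        }
}

int
main(void)
{
        pthread_t td;

        pthread_create(&td, NULL, consumer, NULL);
        payload = 42;                                   /* publish the work... */
        atomic_store_explicit(&need, 1, memory_order_release); /* ...then flag it */
        pthread_join(td, NULL);
        return (0);
}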
*/ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { #ifdef HWPMC_HOOKS it->it_waiting = 0; if (PMC_HOOK_INSTALLED_ANY()) PMC_SOFT_CALL_INTR_HLPR(schedule, frame); #endif CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, td->td_proc->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_wakeup(td, SRQ_INTR); } else { #ifdef HWPMC_HOOKS it->it_waiting++; if (PMC_HOOK_INSTALLED_ANY() && (it->it_waiting >= intr_hwpmc_waiting_report_threshold)) PMC_SOFT_CALL_INTR_HLPR(waiting, frame); #endif CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, td->td_proc->p_pid, td->td_name, it->it_need, TD_GET_STATE(td)); thread_unlock(td); } return (0); } /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error = 0; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? *eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } if (handler != NULL) { error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); } return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error __unused; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); if ((flags & SWI_FROMNMI) == 0) { entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_SWI); } /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (flags & SWI_DELAY) return; if (flags & SWI_FROMNMI) { #if defined(SMP) && (defined(__i386__) || defined(__amd64__)) KASSERT(ie == clk_intr_event, ("SWI_FROMNMI used not with clk_intr_event")); ipi_self_from_nmi(IPI_SWI); #endif } else { VM_CNT_INC(v_soft); error = intr_event_schedule_thread(ie, NULL); KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } static void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn, *ihp; ihp = NULL; CK_SLIST_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. 
*/ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); if (ihp == NULL) CK_SLIST_REMOVE_HEAD(&ie->ie_handlers, ih_next); else CK_SLIST_REMOVE_AFTER(ihp, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* * Now that we know that the current element won't be removed * update the previous element. */ ihp = ih; if ((ih->ih_flags & IH_CHANGED) != 0) { mtx_lock(&ie->ie_lock); ih->ih_flags &= ~IH_CHANGED; wakeup(ih); mtx_unlock(&ie->ie_lock); } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* Skip suspended handlers */ if ((ih->ih_flags & IH_SUSP) != 0) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. */ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct epoch_tracker et; struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake, epoch_count; bool needs_epoch; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. 
*/ needs_epoch = (atomic_load_int(&ie->ie_hflags) & IH_NET) != 0; if (needs_epoch) { epoch_count = 0; NET_EPOCH_ENTER(et); } while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) { ithread_execute_handlers(p, ie); if (needs_epoch && ++epoch_count >= intr_epoch_batch) { NET_EPOCH_EXIT(et); epoch_count = 0; NET_EPOCH_ENTER(et); } } if (needs_epoch) NET_EPOCH_EXIT(et); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT); } else { if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); } if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (i.e. i386) pass a frame to some. * handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int phase; int ret; bool filter, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; filter = false; thread = false; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; phase = ie->ie_phase; atomic_add_int(&ie->ie_active[phase], 1); /* * This fence is required to ensure that no later loads are * re-ordered before the ie_active store. */ atomic_thread_fence_seq_cst(); CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if ((ih->ih_flags & IH_SUSP) != 0) continue; if ((ie->ie_flags & IE_SOFT) != 0 && ih->ih_need == 0) continue; if (ih->ih_filter == NULL) { thread = true; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); #ifdef HWPMC_HOOKS PMC_SOFT_CALL_TF( , , intr, all, frame); #endif KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); filter = filter || ret == FILTER_HANDLED; #ifdef HWPMC_HOOKS if (ret & FILTER_SCHEDULE_THREAD) PMC_SOFT_CALL_TF( , , intr, ithread, frame); else if (ret & FILTER_HANDLED) PMC_SOFT_CALL_TF( , , intr, filter, frame); else if (ret == FILTER_STRAY) PMC_SOFT_CALL_TF( , , intr, stray, frame); #endif /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler. 
* * TODO: write a generic wrapper to avoid people rolling * their own. */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = true; } } atomic_add_rel_int(&ie->ie_active[phase], -1); td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { int error __unused; error = intr_event_schedule_thread(ie, frame); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; #ifdef notyet /* The interrupt is not acknowledged by any filter and has no ithread. */ if (!thread && !filter) return (EINVAL); #endif return (0); } #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_INTR: db_printf("INTR"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about an event.
*/ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ -DB_SHOW_COMMAND(intr, db_show_intr) +DB_SHOW_COMMAND_FLAGS(intr, db_show_intr, DB_CMD_MEMSAFE) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && CK_SLIST_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(&clk_intr_event, "clk", NULL, NULL, SWI_CLOCK, INTR_MPSAFE, NULL)) panic("died while creating clk swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt however is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time. */ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. 
*/ -DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) +DB_SHOW_COMMAND_FLAGS(intrcnt, db_show_intrcnt, DB_CMD_MEMSAFE) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c index 79e4b6121054..fe25122ecb9e 100644 --- a/sys/kern/kern_ktr.c +++ b/sys/kern/kern_ktr.c @@ -1,482 +1,482 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables used by KTR and the ktr_tracepoint() * function that does the actual tracing. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktr.h" #include "opt_alq.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifndef KTR_BOOT_ENTRIES #define KTR_BOOT_ENTRIES 1024 #endif #ifndef KTR_ENTRIES #define KTR_ENTRIES 1024 #endif /* Limit the allocations to something manageable. 
*/ #define KTR_ENTRIES_MAX (8 * 1024 * 1024) #ifndef KTR_MASK #define KTR_MASK (0) #endif #ifndef KTR_CPUMASK #define KTR_CPUMASK CPUSET_FSET #endif #ifndef KTR_TIME #define KTR_TIME get_cyclecount() #endif #ifndef KTR_CPU #define KTR_CPU PCPU_GET(cpuid) #endif static MALLOC_DEFINE(M_KTR, "KTR", "KTR"); FEATURE(ktr, "Kernel support for KTR kernel tracing facility"); volatile int ktr_idx = 0; uint64_t ktr_mask = KTR_MASK; uint64_t ktr_compile = KTR_COMPILE; int ktr_entries = KTR_BOOT_ENTRIES; int ktr_version = KTR_VERSION; struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES]; struct ktr_entry *ktr_buf = ktr_buf_init; cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK); static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "KTR options"); SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, "Version of the KTR interface"); SYSCTL_UQUAD(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD, &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel"); static int sysctl_debug_ktr_cpumask(SYSCTL_HANDLER_ARGS) { char lktr_cpumask_str[CPUSETBUFSIZ]; cpuset_t imask; int error; cpusetobj_strprint(lktr_cpumask_str, &ktr_cpumask); error = sysctl_handle_string(oidp, lktr_cpumask_str, sizeof(lktr_cpumask_str), req); if (error != 0 || req->newptr == NULL) return (error); if (cpusetobj_strscan(&imask, lktr_cpumask_str) == -1) return (EINVAL); CPU_COPY(&imask, &ktr_cpumask); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RWTUN | CTLFLAG_MPSAFE | CTLTYPE_STRING, NULL, 0, sysctl_debug_ktr_cpumask, "S", "Bitmask of CPUs on which KTR logging is enabled"); static int sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries); ktr_idx = 0; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_debug_ktr_clear, "I", "Clear KTR Buffer"); /* * This is a sysctl proc so that it is serialized as !MPSAFE along with * the other ktr sysctl procs. */ static int sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS) { uint64_t mask; int error; mask = ktr_mask; error = sysctl_handle_64(oidp, &mask, 0, req); if (error || !req->newptr) return (error); ktr_mask = mask; return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, 0, sysctl_debug_ktr_mask, "QU", "Bitmask of KTR event classes for which logging is enabled"); #if KTR_ENTRIES > KTR_BOOT_ENTRIES /* * A simplified version of sysctl_debug_ktr_entries. * No need to care about SMP, scheduling, etc. */ static void ktr_entries_initializer(void *dummy __unused) { uint64_t mask; /* Temporarily disable ktr in case malloc() is being traced. 
*/ mask = ktr_mask; ktr_mask = 0; ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR, M_WAITOK | M_ZERO); memcpy(ktr_buf, ktr_buf_init + ktr_idx, (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf)); if (ktr_idx != 0) { memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init, ktr_idx * sizeof(*ktr_buf)); ktr_idx = KTR_BOOT_ENTRIES; } ktr_entries = KTR_ENTRIES; ktr_mask = mask; } SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY, ktr_entries_initializer, NULL); #endif static int sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS) { uint64_t mask; int entries, error; struct ktr_entry *buf, *oldbuf; entries = ktr_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error || !req->newptr) return (error); if (entries > KTR_ENTRIES_MAX) return (ERANGE); /* Disable ktr temporarily. */ mask = ktr_mask; ktr_mask = 0; /* Wait for threads to go idle. */ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) { ktr_mask = mask; return (error); } if (ktr_buf != ktr_buf_init) oldbuf = ktr_buf; else oldbuf = NULL; /* Allocate a new buffer. */ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO); /* Install the new buffer and restart ktr. */ ktr_buf = buf; ktr_entries = entries; ktr_idx = 0; ktr_mask = mask; if (oldbuf != NULL) free(oldbuf, M_KTR); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer"); #ifdef KTR_VERBOSE int ktr_verbose = KTR_VERBOSE; TUNABLE_INT("debug.ktr.verbose", &ktr_verbose); SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, ""); #endif #ifdef KTR_ALQ struct alq *ktr_alq; char ktr_alq_file[MAXPATHLEN] = "/tmp/ktr.out"; int ktr_alq_cnt = 0; int ktr_alq_depth = KTR_ENTRIES; int ktr_alq_enabled = 0; int ktr_alq_failed = 0; int ktr_alq_max = 0; SYSCTL_INT(_debug_ktr, OID_AUTO, alq_max, CTLFLAG_RW, &ktr_alq_max, 0, "Maximum number of entries to write"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_cnt, CTLFLAG_RD, &ktr_alq_cnt, 0, "Current number of written entries"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_failed, CTLFLAG_RD, &ktr_alq_failed, 0, "Number of times we overran the buffer"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_depth, CTLFLAG_RW, &ktr_alq_depth, 0, "Number of items in the write buffer"); SYSCTL_STRING(_debug_ktr, OID_AUTO, alq_file, CTLFLAG_RW, ktr_alq_file, sizeof(ktr_alq_file), "KTR logging file"); static int sysctl_debug_ktr_alq_enable(SYSCTL_HANDLER_ARGS) { int error; int enable; enable = ktr_alq_enabled; error = sysctl_handle_int(oidp, &enable, 0, req); if (error || !req->newptr) return (error); if (enable) { if (ktr_alq_enabled) return (0); error = alq_open(&ktr_alq, (const char *)ktr_alq_file, req->td->td_ucred, ALQ_DEFAULT_CMODE, sizeof(struct ktr_entry), ktr_alq_depth); if (error == 0) { ktr_alq_cnt = 0; ktr_alq_failed = 0; ktr_alq_enabled = 1; } } else { if (ktr_alq_enabled == 0) return (0); ktr_alq_enabled = 0; alq_close(ktr_alq); ktr_alq = NULL; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, alq_enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_debug_ktr_alq_enable, "I", "Enable KTR logging"); #endif void ktr_tracepoint(uint64_t mask, const char *file, int line, const char *format, u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5, u_long arg6) { struct ktr_entry *entry; #ifdef KTR_ALQ struct ale *ale = NULL; #endif int newindex, saveindex; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) struct thread *td; #endif int cpu; if (KERNEL_PANICKED() || kdb_active) return; 
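Aside: the slot-claiming loop a few lines below advances ktr_idx with atomic_cmpset_rel_int(9) so that concurrent tracepoints never hand out the same buffer entry. A minimal userland sketch of that compare-and-swap retry pattern, using C11 atomics instead of the kernel primitives, might look as follows; ring_claim_slot, ring_idx, and RING_ENTRIES are hypothetical names used for illustration only, not kernel symbols.

#include <stdatomic.h>
#include <stdio.h>

#define RING_ENTRIES	1024		/* stand-in for ktr_entries */

static _Atomic int ring_idx;		/* stand-in for ktr_idx */

/*
 * Claim the next slot in a shared ring without a lock: compute the
 * successor index and retry the compare-and-swap until no other thread
 * has raced us, mirroring the kernel's do/while cmpset loop.
 */
static int
ring_claim_slot(void)
{
	int save, next;

	do {
		save = atomic_load(&ring_idx);
		next = (save + 1) % RING_ENTRIES;
	} while (!atomic_compare_exchange_weak(&ring_idx, &save, next));
	return (save);
}

int
main(void)
{
	for (int i = 0; i < 4; i++)
		printf("claimed slot %d\n", ring_claim_slot());
	return (0);
}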
if ((ktr_mask & mask) == 0 || ktr_buf == NULL) return; cpu = KTR_CPU; if (!CPU_ISSET(cpu, &ktr_cpumask)) return; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td = curthread; if (td->td_pflags & TDP_INKTR) return; td->td_pflags |= TDP_INKTR; #endif #ifdef KTR_ALQ if (ktr_alq_enabled) { if (td->td_critnest == 0 && (TD_IS_IDLETHREAD(td)) == 0 && td != ald_thread) { if (ktr_alq_max && ktr_alq_cnt > ktr_alq_max) goto done; if ((ale = alq_get(ktr_alq, ALQ_NOWAIT)) == NULL) { ktr_alq_failed++; goto done; } ktr_alq_cnt++; entry = (struct ktr_entry *)ale->ae_data; } else { goto done; } } else #endif { do { saveindex = ktr_idx; newindex = (saveindex + 1) % ktr_entries; } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0); entry = &ktr_buf[saveindex]; } entry->ktr_timestamp = KTR_TIME; entry->ktr_cpu = cpu; entry->ktr_thread = curthread; if (file != NULL) while (strncmp(file, "../", 3) == 0) file += 3; entry->ktr_file = file; entry->ktr_line = line; #ifdef KTR_VERBOSE if (ktr_verbose) { #ifdef SMP printf("cpu%d ", cpu); #endif if (ktr_verbose > 1) { printf("%s.%d\t", entry->ktr_file, entry->ktr_line); } printf(format, arg1, arg2, arg3, arg4, arg5, arg6); printf("\n"); } #endif entry->ktr_desc = format; entry->ktr_parms[0] = arg1; entry->ktr_parms[1] = arg2; entry->ktr_parms[2] = arg3; entry->ktr_parms[3] = arg4; entry->ktr_parms[4] = arg5; entry->ktr_parms[5] = arg6; #ifdef KTR_ALQ if (ktr_alq_enabled && ale) alq_post(ktr_alq, ale); done: #endif #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td->td_pflags &= ~TDP_INKTR; #endif } #ifdef DDB struct tstate { int cur; int first; }; static struct tstate tstate; static int db_ktr_verbose; static int db_mach_vtrace(void); -DB_SHOW_COMMAND(ktr, db_ktr_all) +DB_SHOW_COMMAND_FLAGS(ktr, db_ktr_all, DB_CMD_MEMSAFE) { tstate.cur = (ktr_idx - 1) % ktr_entries; tstate.first = -1; db_ktr_verbose = 0; db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0; db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just timestamp please */ if (strchr(modif, 'a') != NULL) { db_disable_pager(); while (cncheckc() == -1) if (db_mach_vtrace() == 0) break; } else { while (!db_pager_quit) if (db_mach_vtrace() == 0) break; } } static int db_mach_vtrace(void) { struct ktr_entry *kp; if (tstate.cur == tstate.first || ktr_buf == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } kp = &ktr_buf[tstate.cur]; /* Skip over unused entries. */ if (kp->ktr_desc == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } db_printf("%d (%p", tstate.cur, kp->ktr_thread); #ifdef SMP db_printf(":cpu%d", kp->ktr_cpu); #endif db_printf(")"); if (db_ktr_verbose >= 1) { db_printf(" %10.10lld", (long long)kp->ktr_timestamp); } if (db_ktr_verbose >= 2) { db_printf(" %s.%d", kp->ktr_file, kp->ktr_line); } db_printf(": "); db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1], kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4], kp->ktr_parms[5]); db_printf("\n"); if (tstate.first == -1) tstate.first = tstate.cur; if (--tstate.cur < 0) tstate.cur = ktr_entries - 1; return (1); } #endif /* DDB */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index 2d3a1be54566..9756bc571dca 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -1,2352 +1,2352 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ static u_int kld_busy; static struct thread *kld_busy_owner; /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. 
*/ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while (0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, NULL); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; int last; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. 
*/ last = SI_SUB_DUMMY; sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ if ((*sipp)->subsystem > last) BOOTTRACE("%s: sysinit 0x%7x", lf->filename, (*sipp)->subsystem); /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); last = (*sipp)->subsystem; } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; int last; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); last = SI_SUB_DUMMY; for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ if ((*sipp)->subsystem > last) BOOTTRACE("%s: sysuninit 0x%7x", lf->filename, (*sipp)->subsystem); /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); last = (*sipp)->subsystem; } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf, bool enable) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) { if (enable) sysctl_register_oid(*oidp); else sysctl_register_disabled_oid(*oidp); } sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_enable_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_enable_sysctls: enable SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_enable_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, 
NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, NULL); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf, false); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } linker_file_enable_sysctls(lf); EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type has not been recognized by the last try * printout a message before to fail. */ if (error == ENOSYS) printf("%s: %s - unsupported file type\n", __func__, filename); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. 
*/ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->dtors_addr = 0; lf->dtors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; #ifdef __arm__ lf->exidx_addr = 0; lf->exidx_size = 0; #endif STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. 
*/ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_unregister_sysctls(file); linker_file_sysuninit(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. 
*/ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. 
#ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_DEBUG_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_DEBUG_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environments. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name_flags(caddr_t value, char *buf, u_int buflen, long *offset, int flags) { int error; KASSERT((flags & (M_NOWAIT | M_WAITOK)) != 0 && (flags & (M_NOWAIT | M_WAITOK)) != (M_NOWAIT | M_WAITOK), ("%s: bad flags: 0x%x", __func__, flags)); if (flags & M_NOWAIT) { if (!sx_try_slock(&kld_sx)) return (EWOULDBLOCK); } else sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_search_symbol_name_flags(value, buf, buflen, offset, M_WAITOK)); } int linker_kldload_busy(int flags) { int error; MPASS((flags & ~(LINKER_UB_UNLOCK | LINKER_UB_LOCKED | LINKER_UB_PCATCH)) == 0); if ((flags & LINKER_UB_LOCKED) != 0) sx_assert(&kld_sx, SA_XLOCKED); if ((flags & LINKER_UB_LOCKED) == 0) sx_xlock(&kld_sx); while (kld_busy > 0) { if (kld_busy_owner == curthread) break; error = sx_sleep(&kld_busy, &kld_sx, (flags & LINKER_UB_PCATCH) != 0 ?
PCATCH : 0, "kldbusy", 0); if (error != 0) { if ((flags & LINKER_UB_UNLOCK) != 0) sx_xunlock(&kld_sx); return (error); } } kld_busy++; kld_busy_owner = curthread; if ((flags & LINKER_UB_UNLOCK) != 0) sx_xunlock(&kld_sx); return (0); } void linker_kldload_unbusy(int flags) { MPASS((flags & ~LINKER_UB_LOCKED) == 0); if ((flags & LINKER_UB_LOCKED) != 0) sx_assert(&kld_sx, SA_XLOCKED); if ((flags & LINKER_UB_LOCKED) == 0) sx_xlock(&kld_sx); MPASS(kld_busy > 0); if (kld_busy_owner != curthread) panic("linker_kldload_unbusy done by not owning thread %p", kld_busy_owner); kld_busy--; if (kld_busy == 0) { kld_busy_owner = NULL; wakeup(&kld_busy); } sx_xunlock(&kld_sx); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. */ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } error = linker_kldload_busy(LINKER_UB_PCATCH); if (error != 0) { sx_xunlock(&kld_sx); return (error); } /* * It is possible that a kldloaded module will attach a new ifnet, * so vnet context must be set when this occurs. */ CURVNET_SET(TD_TO_VNET(td)); error = linker_load_module(kldname, modname, NULL, NULL, &lf); CURVNET_RESTORE(); if (error == 0) { lf->userrefs++; if (fileid != NULL) *fileid = lf->id; } linker_kldload_unbusy(LINKER_UB_LOCKED); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); error = linker_kldload_busy(LINKER_UB_PCATCH); if (error != 0) { sx_xunlock(&kld_sx); return (error); } CURVNET_SET(TD_TO_VNET(td)); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ?
*/ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; CURVNET_RESTORE(); linker_kldload_unbusy(LINKER_UB_LOCKED); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat *stat; int error, version; /* * Check the version of the user's structure. 
*/ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO); error = kern_kldstat(td, uap->fileid, stat); if (error == 0) error = copyout(stat, uap->stat, version); free(stat, M_TEMP); return (error); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > sizeof(stat->name)) namelen = sizeof(stat->name); bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > sizeof(stat->pathname)) namelen = sizeof(stat->pathname); bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } #ifdef DDB -DB_COMMAND(kldstat, db_kldstat) +DB_COMMAND_FLAGS(kldstat, db_kldstat, DB_CMD_MEMSAFE) { linker_file_t lf; #define POINTER_WIDTH ((int)(sizeof(void *) * 2 + 2)) db_printf("Id Refs Address%*c Size Name\n", POINTER_WIDTH - 7, ' '); #undef POINTER_WIDTH TAILQ_FOREACH(lf, &linker_files, link) { if (db_pager_quit) return; db_printf("%2d %4d %p %-8zx %s\n", lf->id, lf->refs, lf->address, lf->size, lf->filename); } } #endif /* DDB */ int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) 
{ modlist_t mod; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. 
*/ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (!TAILQ_EMPTY(&lf->modules)) lf->flags |= LINKER_FILE_MODULES; if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf, true); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, NULL); /* * Handle preload files that failed to load any modules. 
*/ static void linker_preload_finish(void *arg) { linker_file_t lf, nlf; sx_xlock(&kld_sx); TAILQ_FOREACH_SAFE(lf, &linker_files, link, nlf) { /* * If all of the modules in this file failed to load, unload * the file and return an error of ENOEXEC. (Parity with * linker_load_file.) */ if ((lf->flags & LINKER_FILE_MODULES) != 0 && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); continue; } lf->flags &= ~LINKER_FILE_MODULES; lf->userrefs++; /* so we can (try to) kldunload it */ } sx_xunlock(&kld_sx); } /* * Attempt to run after all DECLARE_MODULE SYSINITs. Unfortunately they can be * scheduled at any subsystem and order, so run this as late as possible. init * becomes runnable in SI_SUB_KTHREAD_INIT, so go slightly before that. */ SYSINIT(preload_finish, SI_SUB_KTHREAD_INIT - 100, SI_ORDER_MIDDLE, linker_preload_finish, NULL); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) the we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static const char * const linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ const char * const *cpp, *sep; char *result; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE_PNBUF(&nd); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + roundup2((ptr) - (base), sizeof(int)) /* * Lookup KLD which contains requested module in the "linker.hints" file. If * version specification is available, then try to find the best KLD. * Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? 
td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; const char *best, *sep; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *pathbuf; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE_PNBUF(&nd); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("linker.hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("linker.hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally check if KLD is in the place */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * KLD is newer than hints file. What we should do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing found or hints is absent - fallback to the old * way by using "kldname[.ko]" as module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Lookup KLD which contains requested module in the all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. 
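 * The path is a single string of ';'-separated components (the same
 * separator the boot loader uses, see the comment above linker_path).
 * A minimal sketch of the walk performed here and in
 * linker_search_module() above, with try_component() standing in for
 * the per-directory lookup:
 *
 *	const char *cp, *ep;
 *
 *	for (cp = path; *cp != '\0'; cp = ep + 1) {
 *		for (ep = cp; *ep != '\0' && *ep != ';'; ep++)
 *			;
 *		try_component(cp, ep - cp);	// length-delimited piece
 *		if (*ep == '\0')
 *			break;
 *	}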
*/ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?")); /* The last entry of the malloced area comprises of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* check if root file system is not mounted */ static bool linker_root_mounted(void) { struct pwd *pwd; bool ret; if (rootvnode == NULL) return (false); pwd = pwd_hold(curthread); ret = pwd->pwd_rdir != NULL; pwd_drop(pwd); return (ret); } /* * Find a file which contains given module and load it, if "parent" is not * NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); if (!linker_root_mounted()) return (ENXIO); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (!linker_root_mounted()) return (ENXIO); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else /* * Need to find a KLD with required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland initiated * kldload(2)'s of files. 
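 *
 * The MDT_VERSION and MDT_DEPEND records consumed below come from the
 * MODULE_VERSION() and MODULE_DEPEND() declarations in the module's own
 * source.  As a sketch (hypothetical module names), a foo.ko providing
 * interface version 1 of "foo" and requiring version 1 of "bar" would
 * declare:
 *
 *	MODULE_VERSION(foo, 1);
 *	MODULE_DEPEND(foo, bar, 1, 1, 1);	// minimum, preferred, maximum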
*/ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0; /* * All files are dependent on /kernel. */ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index eb70e4e6cd57..6bdcb48c5061 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -1,1579 +1,1579 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2005-2009 Robert N. M. Watson * Copyright (c) 2008 Otto Moerbeek (mallocarray) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 */ /* * Kernel malloc(9) implementation -- general purpose kernel memory allocator * based on memory types. Back end is implemented using the UMA(9) zone * allocator. A set of fixed-size buckets are used for smaller allocations, * and a special UMA allocation interface is used for larger allocations. * Callers declare memory types, and statistics are maintained independently * for each memory type. Statistics are maintained per-CPU for performance * reasons. See malloc(9) and comments in malloc.h for a detailed * description. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EPOCH_TRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #ifdef DEBUG_REDZONE #include #endif #if defined(INVARIANTS) && defined(__i386__) #include #endif #include #ifdef KDTRACE_HOOKS #include bool __read_frequently dtrace_malloc_enabled; dtrace_malloc_probe_func_t __read_mostly dtrace_malloc_probe; #endif #if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) || \ defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE) #define MALLOC_DEBUG 1 #endif #if defined(KASAN) || defined(DEBUG_REDZONE) #define DEBUG_REDZONE_ARG_DEF , unsigned long osize #define DEBUG_REDZONE_ARG , osize #else #define DEBUG_REDZONE_ARG_DEF #define DEBUG_REDZONE_ARG #endif /* * When realloc() is called, if the new size is sufficiently smaller than * the old size, realloc() will allocate a new, smaller block to avoid * wasting memory. 'Sufficiently smaller' is defined as: newsize <= * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. */ #ifndef REALLOC_FRACTION #define REALLOC_FRACTION 1 /* new block if <= half the size */ #endif /* * Centrally define some common malloc types. 
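 * A type is declared once with MALLOC_DEFINE() (and made visible to
 * other files with MALLOC_DECLARE() in a header), then passed to every
 * malloc(9)/free(9) call so that statistics are accounted per type.
 * Minimal usage sketch with a hypothetical type:
 *
 *	static MALLOC_DEFINE(M_FOOBUF, "foobuf", "foo driver buffers");
 *
 *	buf = malloc(len, M_FOOBUF, M_WAITOK | M_ZERO);
 *	...
 *	free(buf, M_FOOBUF);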
*/ MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches"); MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers"); static struct malloc_type *kmemstatistics; static int kmemcount; #define KMEM_ZSHIFT 4 #define KMEM_ZBASE 16 #define KMEM_ZMASK (KMEM_ZBASE - 1) #define KMEM_ZMAX 65536 #define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT) static uint8_t kmemsize[KMEM_ZSIZE + 1]; #ifndef MALLOC_DEBUG_MAXZONES #define MALLOC_DEBUG_MAXZONES 1 #endif static int numzones = MALLOC_DEBUG_MAXZONES; /* * Small malloc(9) memory allocations are allocated from a set of UMA buckets * of various sizes. * * Warning: the layout of the struct is duplicated in libmemstat for KVM support. * * XXX: The comment here used to read "These won't be powers of two for * long." It's possible that a significant amount of wasted memory could be * recovered by tuning the sizes of these buckets. */ struct { int kz_size; const char *kz_name; uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES]; } kmemzones[] = { {16, "malloc-16", }, {32, "malloc-32", }, {64, "malloc-64", }, {128, "malloc-128", }, {256, "malloc-256", }, {384, "malloc-384", }, {512, "malloc-512", }, {1024, "malloc-1024", }, {2048, "malloc-2048", }, {4096, "malloc-4096", }, {8192, "malloc-8192", }, {16384, "malloc-16384", }, {32768, "malloc-32768", }, {65536, "malloc-65536", }, {0, NULL}, }; u_long vm_kmem_size; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0, "Size of kernel memory"); static u_long kmem_zmax = KMEM_ZMAX; SYSCTL_ULONG(_vm, OID_AUTO, kmem_zmax, CTLFLAG_RDTUN, &kmem_zmax, 0, "Maximum allocation size that malloc(9) would use UMA as backend"); static u_long vm_kmem_size_min; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0, "Minimum size of kernel memory"); static u_long vm_kmem_size_max; SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0, "Maximum size of kernel memory"); static u_int vm_kmem_size_scale; SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0, "Scale factor for kernel memory size"); static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size, CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmem_map_size, "LU", "Current kmem allocation size"); static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free, CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmem_map_free, "LU", "Free space in kmem"); static SYSCTL_NODE(_vm, OID_AUTO, malloc, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Malloc information"); static u_int vm_malloc_zone_count = nitems(kmemzones); SYSCTL_UINT(_vm_malloc, OID_AUTO, zone_count, CTLFLAG_RD, &vm_malloc_zone_count, 0, "Number of malloc zones"); static int sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm_malloc, OID_AUTO, zone_sizes, CTLFLAG_RD | CTLTYPE_OPAQUE | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_malloc_zone_sizes, "S", "Zone sizes used by malloc"); /* * The malloc_mtx protects the kmemstatistics linked list. */ struct mtx malloc_mtx; static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS); #if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1) static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Kernel malloc debugging options"); #endif /* * malloc(9) fault injection -- cause malloc failures every (n) mallocs when * the caller specifies M_NOWAIT. If set to 0, no failures are caused. 
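 * The check itself lives in malloc_dbg() below and is a simple modulus:
 * a global count of M_NOWAIT allocations is kept, and each time it
 * reaches a multiple of debug.malloc.failure_rate the allocation is
 * failed and the failure counter is bumped.  In outline:
 *
 *	if ((flags & M_NOWAIT) && failure_rate != 0 &&
 *	    ++nowait_count % failure_rate == 0) {
 *		failure_count++;
 *		return (NULL);		// imposed failure
 *	}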
*/ #ifdef MALLOC_MAKE_FAILURES static int malloc_failure_rate; static int malloc_nowait_count; static int malloc_failure_count; SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RWTUN, &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail"); SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD, &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures"); #endif static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS) { u_long size; size = uma_size(); return (sysctl_handle_long(oidp, &size, 0, req)); } static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS) { u_long size, limit; /* The sysctl is unsigned, implement as a saturation value. */ size = uma_size(); limit = uma_limit(); if (size > limit) size = 0; else size = limit - size; return (sysctl_handle_long(oidp, &size, 0, req)); } static int sysctl_vm_malloc_zone_sizes(SYSCTL_HANDLER_ARGS) { int sizes[nitems(kmemzones)]; int i; for (i = 0; i < nitems(kmemzones); i++) { sizes[i] = kmemzones[i].kz_size; } return (SYSCTL_OUT(req, &sizes, sizeof(sizes))); } /* * malloc(9) uma zone separation -- sub-page buffer overruns in one * malloc type will affect only a subset of other malloc types. */ #if MALLOC_DEBUG_MAXZONES > 1 static void tunable_set_numzones(void) { TUNABLE_INT_FETCH("debug.malloc.numzones", &numzones); /* Sanity check the number of malloc uma zones. */ if (numzones <= 0) numzones = 1; if (numzones > MALLOC_DEBUG_MAXZONES) numzones = MALLOC_DEBUG_MAXZONES; } SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL); SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &numzones, 0, "Number of malloc uma subzones"); /* * Any number that changes regularly is an okay choice for the * offset. Build numbers are pretty good of you have them. */ static u_int zone_offset = __FreeBSD_version; TUNABLE_INT("debug.malloc.zone_offset", &zone_offset); SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN, &zone_offset, 0, "Separate malloc types by examining the " "Nth character in the malloc type short description."); static void mtp_set_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; const char *desc; size_t len; u_int val; mtip = &mtp->ks_mti; desc = mtp->ks_shortdesc; if (desc == NULL || (len = strlen(desc)) == 0) val = 0; else val = desc[zone_offset % len]; mtip->mti_zone = (val % numzones); } static inline u_int mtp_get_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; mtip = &mtp->ks_mti; KASSERT(mtip->mti_zone < numzones, ("mti_zone %u out of range %d", mtip->mti_zone, numzones)); return (mtip->mti_zone); } #elif MALLOC_DEBUG_MAXZONES == 0 #error "MALLOC_DEBUG_MAXZONES must be positive." #else static void mtp_set_subzone(struct malloc_type *mtp) { struct malloc_type_internal *mtip; mtip = &mtp->ks_mti; mtip->mti_zone = 0; } static inline u_int mtp_get_subzone(struct malloc_type *mtp) { return (0); } #endif /* MALLOC_DEBUG_MAXZONES > 1 */ /* * An allocation has succeeded -- update malloc type statistics for the * amount of bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-PCU * statistics. 
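 * The per-CPU update pattern used below (and mirrored in
 * malloc_type_freed()) is, in outline:
 *
 *	critical_enter();		// stay on this CPU, no preemption
 *	mtsp = zpcpu_get(mtip->mti_stats);
 *	mtsp->mts_memalloced += size;
 *	mtsp->mts_numallocs++;
 *	critical_exit();
 *
 * No lock is needed because each CPU only touches its own copy; the
 * kern.malloc_stats sysctl later exports every CPU's copy and leaves
 * aggregation to the consumer.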
*/ static void malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size, int zindx) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = &mtp->ks_mti; mtsp = zpcpu_get(mtip->mti_stats); if (size > 0) { mtsp->mts_memalloced += size; mtsp->mts_numallocs++; } if (zindx != -1) mtsp->mts_size |= 1 << zindx; #ifdef KDTRACE_HOOKS if (__predict_false(dtrace_malloc_enabled)) { uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_MALLOC]; if (probe_id != 0) (dtrace_malloc_probe)(probe_id, (uintptr_t) mtp, (uintptr_t) mtip, (uintptr_t) mtsp, size, zindx); } #endif critical_exit(); } void malloc_type_allocated(struct malloc_type *mtp, unsigned long size) { if (size > 0) malloc_type_zone_allocated(mtp, size, -1); } /* * A free operation has occurred -- update malloc type statistics for the * amount of the bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-CPU * statistics. */ void malloc_type_freed(struct malloc_type *mtp, unsigned long size) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = &mtp->ks_mti; mtsp = zpcpu_get(mtip->mti_stats); mtsp->mts_memfreed += size; mtsp->mts_numfrees++; #ifdef KDTRACE_HOOKS if (__predict_false(dtrace_malloc_enabled)) { uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_FREE]; if (probe_id != 0) (dtrace_malloc_probe)(probe_id, (uintptr_t) mtp, (uintptr_t) mtip, (uintptr_t) mtsp, size, 0); } #endif critical_exit(); } /* * contigmalloc: * * Allocate a block of physically contiguous memory. * * If M_NOWAIT is set, this routine will not block and return NULL if * the allocation fails. */ void * contigmalloc(unsigned long size, struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) { void *ret; ret = (void *)kmem_alloc_contig(size, flags, low, high, alignment, boundary, VM_MEMATTR_DEFAULT); if (ret != NULL) malloc_type_allocated(type, round_page(size)); return (ret); } void * contigmalloc_domainset(unsigned long size, struct malloc_type *type, struct domainset *ds, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) { void *ret; ret = (void *)kmem_alloc_contig_domainset(ds, size, flags, low, high, alignment, boundary, VM_MEMATTR_DEFAULT); if (ret != NULL) malloc_type_allocated(type, round_page(size)); return (ret); } /* * contigfree: * * Free a block of memory allocated by contigmalloc. * * This routine may not block. */ void contigfree(void *addr, unsigned long size, struct malloc_type *type) { kmem_free((vm_offset_t)addr, size); malloc_type_freed(type, round_page(size)); } #ifdef MALLOC_DEBUG static int malloc_dbg(caddr_t *vap, size_t *sizep, struct malloc_type *mtp, int flags) { #ifdef INVARIANTS int indx; KASSERT(mtp->ks_version == M_VERSION, ("malloc: bad malloc type version")); /* * Check that exactly one of M_WAITOK or M_NOWAIT is specified. 
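 * Exactly one of the two flags must be given; the test below masks the
 * flags with (M_WAITOK | M_NOWAIT) and requires the result to be either
 * M_WAITOK or M_NOWAIT on its own.  For example:
 *
 *	malloc(len, M_TEMP, M_WAITOK);			// ok
 *	malloc(len, M_TEMP, M_NOWAIT | M_ZERO);		// ok
 *	malloc(len, M_TEMP, M_WAITOK | M_NOWAIT);	// flagged
 *	malloc(len, M_TEMP, M_ZERO);			// flagged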
*/ indx = flags & (M_WAITOK | M_NOWAIT); if (indx != M_NOWAIT && indx != M_WAITOK) { static struct timeval lasterr; static int curerr, once; if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) { printf("Bad malloc flags: %x\n", indx); kdb_backtrace(); flags |= M_WAITOK; once++; } } #endif #ifdef MALLOC_MAKE_FAILURES if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) { atomic_add_int(&malloc_nowait_count, 1); if ((malloc_nowait_count % malloc_failure_rate) == 0) { atomic_add_int(&malloc_failure_count, 1); *vap = NULL; return (EJUSTRETURN); } } #endif if (flags & M_WAITOK) { KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); if (__predict_false(!THREAD_CAN_SLEEP())) { #ifdef EPOCH_TRACE epoch_trace_list(curthread); #endif KASSERT(0, ("malloc(M_WAITOK) with sleeping prohibited")); } } KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("malloc: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if (memguard_cmp_mtp(mtp, *sizep)) { *vap = memguard_alloc(*sizep, flags); if (*vap != NULL) return (EJUSTRETURN); /* This is unfortunate but should not be fatal. */ } #endif #ifdef DEBUG_REDZONE *sizep = redzone_size_ntor(*sizep); #endif return (0); } #endif /* * Handle large allocations and frees by using kmem_malloc directly. */ static inline bool malloc_large_slab(uma_slab_t slab) { uintptr_t va; va = (uintptr_t)slab; return ((va & 1) != 0); } static inline size_t malloc_large_size(uma_slab_t slab) { uintptr_t va; va = (uintptr_t)slab; return (va >> 1); } static caddr_t __noinline malloc_large(size_t size, struct malloc_type *mtp, struct domainset *policy, int flags DEBUG_REDZONE_ARG_DEF) { vm_offset_t kva; caddr_t va; size = roundup(size, PAGE_SIZE); kva = kmem_malloc_domainset(policy, size, flags); if (kva != 0) { /* The low bit is unused for slab pointers. */ vsetzoneslab(kva, NULL, (void *)((size << 1) | 1)); uma_total_inc(size); } va = (caddr_t)kva; malloc_type_allocated(mtp, va == NULL ? 0 : size); if (__predict_false(va == NULL)) { KASSERT((flags & M_WAITOK) == 0, ("malloc(M_WAITOK) returned NULL")); } else { #ifdef DEBUG_REDZONE va = redzone_setup(va, osize); #endif kasan_mark((void *)va, osize, size, KASAN_MALLOC_REDZONE); } return (va); } static void free_large(void *addr, size_t size) { kmem_free((vm_offset_t)addr, size); uma_total_dec(size); } /* * malloc: * * Allocate a block of memory. * * If M_NOWAIT is set, this routine will not block and return NULL if * the allocation fails. */ void * (malloc)(size_t size, struct malloc_type *mtp, int flags) { int indx; caddr_t va; uma_zone_t zone; #if defined(DEBUG_REDZONE) || defined(KASAN) unsigned long osize = size; #endif MPASS((flags & M_EXEC) == 0); #ifdef MALLOC_DEBUG va = NULL; if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif if (__predict_false(size > kmem_zmax)) return (malloc_large(size, mtp, DOMAINSET_RR(), flags DEBUG_REDZONE_ARG)); if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)]; va = uma_zalloc(zone, flags); if (va != NULL) { size = zone->uz_size; if ((flags & M_ZERO) == 0) { kmsan_mark(va, size, KMSAN_STATE_UNINIT); kmsan_orig(va, size, KMSAN_TYPE_MALLOC, KMSAN_RET_ADDR); } } malloc_type_zone_allocated(mtp, va == NULL ? 
0 : size, indx); if (__predict_false(va == NULL)) { KASSERT((flags & M_WAITOK) == 0, ("malloc(M_WAITOK) returned NULL")); } #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); #endif #ifdef KASAN if (va != NULL) kasan_mark((void *)va, osize, size, KASAN_MALLOC_REDZONE); #endif return ((void *) va); } static void * malloc_domain(size_t *sizep, int *indxp, struct malloc_type *mtp, int domain, int flags) { uma_zone_t zone; caddr_t va; size_t size; int indx; size = *sizep; KASSERT(size <= kmem_zmax && (flags & M_EXEC) == 0, ("malloc_domain: Called with bad flag / size combination.")); if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone[mtp_get_subzone(mtp)]; va = uma_zalloc_domain(zone, NULL, domain, flags); if (va != NULL) *sizep = zone->uz_size; *indxp = indx; return ((void *)va); } void * malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds, int flags) { struct vm_domainset_iter di; caddr_t va; int domain; int indx; #if defined(KASAN) || defined(DEBUG_REDZONE) unsigned long osize = size; #endif MPASS((flags & M_EXEC) == 0); #ifdef MALLOC_DEBUG va = NULL; if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif if (__predict_false(size > kmem_zmax)) return (malloc_large(size, mtp, DOMAINSET_RR(), flags DEBUG_REDZONE_ARG)); vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { va = malloc_domain(&size, &indx, mtp, domain, flags); } while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0); malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); if (__predict_false(va == NULL)) { KASSERT((flags & M_WAITOK) == 0, ("malloc(M_WAITOK) returned NULL")); } #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); #endif #ifdef KASAN if (va != NULL) kasan_mark((void *)va, osize, size, KASAN_MALLOC_REDZONE); #endif #ifdef KMSAN if ((flags & M_ZERO) == 0) { kmsan_mark(va, size, KMSAN_STATE_UNINIT); kmsan_orig(va, size, KMSAN_TYPE_MALLOC, KMSAN_RET_ADDR); } #endif return (va); } /* * Allocate an executable area. */ void * malloc_exec(size_t size, struct malloc_type *mtp, int flags) { return (malloc_domainset_exec(size, mtp, DOMAINSET_RR(), flags)); } void * malloc_domainset_exec(size_t size, struct malloc_type *mtp, struct domainset *ds, int flags) { #if defined(DEBUG_REDZONE) || defined(KASAN) unsigned long osize = size; #endif #ifdef MALLOC_DEBUG caddr_t va; #endif flags |= M_EXEC; #ifdef MALLOC_DEBUG va = NULL; if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif return (malloc_large(size, mtp, ds, flags DEBUG_REDZONE_ARG)); } void * malloc_aligned(size_t size, size_t align, struct malloc_type *type, int flags) { return (malloc_domainset_aligned(size, align, type, DOMAINSET_RR(), flags)); } void * malloc_domainset_aligned(size_t size, size_t align, struct malloc_type *mtp, struct domainset *ds, int flags) { void *res; size_t asize; KASSERT(powerof2(align), ("malloc_domainset_aligned: wrong align %#zx size %#zx", align, size)); KASSERT(align <= PAGE_SIZE, ("malloc_domainset_aligned: align %#zx (size %#zx) too large", align, size)); /* * Round the allocation size up to the next power of 2, * because we can only guarantee alignment for * power-of-2-sized allocations. Further increase the * allocation size to align if the rounded size is less than * align, since malloc zones provide alignment equal to their * size. */ if (size == 0) size = 1; asize = size <= align ? 
align : 1UL << flsl(size - 1); res = malloc_domainset(asize, mtp, ds, flags); KASSERT(res == NULL || ((uintptr_t)res & (align - 1)) == 0, ("malloc_domainset_aligned: result not aligned %p size %#zx " "allocsize %#zx align %#zx", res, size, asize, align)); return (res); } void * mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) { if (WOULD_OVERFLOW(nmemb, size)) panic("mallocarray: %zu * %zu overflowed", nmemb, size); return (malloc(size * nmemb, type, flags)); } void * mallocarray_domainset(size_t nmemb, size_t size, struct malloc_type *type, struct domainset *ds, int flags) { if (WOULD_OVERFLOW(nmemb, size)) panic("mallocarray_domainset: %zu * %zu overflowed", nmemb, size); return (malloc_domainset(size * nmemb, type, ds, flags)); } #if defined(INVARIANTS) && !defined(KASAN) static void free_save_type(void *addr, struct malloc_type *mtp, u_long size) { struct malloc_type **mtpp = addr; /* * Cache a pointer to the malloc_type that most recently freed * this memory here. This way we know who is most likely to * have stepped on it later. * * This code assumes that size is a multiple of 8 bytes for * 64 bit machines */ mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR); mtpp += (size - sizeof(struct malloc_type *)) / sizeof(struct malloc_type *); *mtpp = mtp; } #endif #ifdef MALLOC_DEBUG static int free_dbg(void **addrp, struct malloc_type *mtp) { void *addr; addr = *addrp; KASSERT(mtp->ks_version == M_VERSION, ("free: bad malloc type version")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("free: called with spinlock or critical section held")); /* free(NULL, ...) does nothing */ if (addr == NULL) return (EJUSTRETURN); #ifdef DEBUG_MEMGUARD if (is_memguard_addr(addr)) { memguard_free(addr); return (EJUSTRETURN); } #endif #ifdef DEBUG_REDZONE redzone_check(addr); *addrp = redzone_addr_ntor(addr); #endif return (0); } #endif /* * free: * * Free a block of memory allocated by malloc. * * This routine may not block. */ void free(void *addr, struct malloc_type *mtp) { uma_zone_t zone; uma_slab_t slab; u_long size; #ifdef MALLOC_DEBUG if (free_dbg(&addr, mtp) != 0) return; #endif /* free(NULL, ...) does nothing */ if (addr == NULL) return; vtozoneslab((vm_offset_t)addr & (~UMA_SLAB_MASK), &zone, &slab); if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (__predict_true(!malloc_large_slab(slab))) { size = zone->uz_size; #if defined(INVARIANTS) && !defined(KASAN) free_save_type(addr, mtp, size); #endif uma_zfree_arg(zone, addr, slab); } else { size = malloc_large_size(slab); free_large(addr, size); } malloc_type_freed(mtp, size); } /* * zfree: * * Zero then free a block of memory allocated by malloc. * * This routine may not block. */ void zfree(void *addr, struct malloc_type *mtp) { uma_zone_t zone; uma_slab_t slab; u_long size; #ifdef MALLOC_DEBUG if (free_dbg(&addr, mtp) != 0) return; #endif /* free(NULL, ...) 
does nothing */ if (addr == NULL) return; vtozoneslab((vm_offset_t)addr & (~UMA_SLAB_MASK), &zone, &slab); if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (__predict_true(!malloc_large_slab(slab))) { size = zone->uz_size; #if defined(INVARIANTS) && !defined(KASAN) free_save_type(addr, mtp, size); #endif kasan_mark(addr, size, size, 0); explicit_bzero(addr, size); uma_zfree_arg(zone, addr, slab); } else { size = malloc_large_size(slab); kasan_mark(addr, size, size, 0); explicit_bzero(addr, size); free_large(addr, size); } malloc_type_freed(mtp, size); } /* * realloc: change the size of a memory block */ void * realloc(void *addr, size_t size, struct malloc_type *mtp, int flags) { #ifndef DEBUG_REDZONE uma_zone_t zone; uma_slab_t slab; #endif unsigned long alloc; void *newaddr; KASSERT(mtp->ks_version == M_VERSION, ("realloc: bad malloc type version")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("realloc: called with spinlock or critical section held")); /* realloc(NULL, ...) is equivalent to malloc(...) */ if (addr == NULL) return (malloc(size, mtp, flags)); /* * XXX: Should report free of old memory and alloc of new memory to * per-CPU stats. */ #ifdef DEBUG_MEMGUARD if (is_memguard_addr(addr)) return (memguard_realloc(addr, size, mtp, flags)); #endif #ifdef DEBUG_REDZONE alloc = redzone_get_size(addr); #else vtozoneslab((vm_offset_t)addr & (~UMA_SLAB_MASK), &zone, &slab); /* Sanity check */ KASSERT(slab != NULL, ("realloc: address %p out of range", (void *)addr)); /* Get the size of the original block */ if (!malloc_large_slab(slab)) alloc = zone->uz_size; else alloc = malloc_large_size(slab); /* Reuse the original block if appropriate */ if (size <= alloc && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE)) { kasan_mark((void *)addr, size, alloc, KASAN_MALLOC_REDZONE); return (addr); } #endif /* !DEBUG_REDZONE */ /* Allocate a new, bigger (or smaller) block */ if ((newaddr = malloc(size, mtp, flags)) == NULL) return (NULL); /* * Copy over original contents. For KASAN, the redzone must be marked * valid before performing the copy. */ kasan_mark(addr, alloc, alloc, 0); bcopy(addr, newaddr, min(size, alloc)); free(addr, mtp); return (newaddr); } /* * reallocf: same as realloc() but free memory on failure. */ void * reallocf(void *addr, size_t size, struct malloc_type *mtp, int flags) { void *mem; if ((mem = realloc(addr, size, mtp, flags)) == NULL) free(addr, mtp); return (mem); } /* * malloc_size: returns the number of bytes allocated for a request of the * specified size */ size_t malloc_size(size_t size) { int indx; if (size > kmem_zmax) return (0); if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; return (kmemzones[indx].kz_size); } /* * malloc_usable_size: returns the usable size of the allocation. 
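 * The usable size is the size of the backing bucket or page run, not
 * the size originally requested, so it can be larger than the request.
 * With the default bucket layout, for example:
 *
 *	p = malloc(100, M_TEMP, M_WAITOK);
 *	len = malloc_usable_size(p);	// 128, the malloc-128 bucket
 *
 * Large allocations are rounded up to whole pages, so the result there
 * is a multiple of PAGE_SIZE.  (With DEBUG_REDZONE the requested size
 * is returned instead.)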
*/ size_t malloc_usable_size(const void *addr) { #ifndef DEBUG_REDZONE uma_zone_t zone; uma_slab_t slab; #endif u_long size; if (addr == NULL) return (0); #ifdef DEBUG_MEMGUARD if (is_memguard_addr(__DECONST(void *, addr))) return (memguard_get_req_size(addr)); #endif #ifdef DEBUG_REDZONE size = redzone_get_size(__DECONST(void *, addr)); #else vtozoneslab((vm_offset_t)addr & (~UMA_SLAB_MASK), &zone, &slab); if (slab == NULL) panic("malloc_usable_size: address %p(%p) is not allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!malloc_large_slab(slab)) size = zone->uz_size; else size = malloc_large_size(slab); #endif /* * Unmark the redzone to avoid reports from consumers who are * (presumably) about to use the full allocation size. */ kasan_mark(addr, size, size, 0); return (size); } CTASSERT(VM_KMEM_SIZE_SCALE >= 1); /* * Initialize the kernel memory (kmem) arena. */ void kmeminit(void) { u_long mem_size; u_long tmp; #ifdef VM_KMEM_SIZE if (vm_kmem_size == 0) vm_kmem_size = VM_KMEM_SIZE; #endif #ifdef VM_KMEM_SIZE_MIN if (vm_kmem_size_min == 0) vm_kmem_size_min = VM_KMEM_SIZE_MIN; #endif #ifdef VM_KMEM_SIZE_MAX if (vm_kmem_size_max == 0) vm_kmem_size_max = VM_KMEM_SIZE_MAX; #endif /* * Calculate the amount of kernel virtual address (KVA) space that is * preallocated to the kmem arena. In order to support a wide range * of machines, it is a function of the physical memory size, * specifically, * * min(max(physical memory size / VM_KMEM_SIZE_SCALE, * VM_KMEM_SIZE_MIN), VM_KMEM_SIZE_MAX) * * Every architecture must define an integral value for * VM_KMEM_SIZE_SCALE. However, the definitions of VM_KMEM_SIZE_MIN * and VM_KMEM_SIZE_MAX, which represent respectively the floor and * ceiling on this preallocation, are optional. Typically, * VM_KMEM_SIZE_MAX is itself a function of the available KVA space on * a given architecture. */ mem_size = vm_cnt.v_page_count; if (mem_size <= 32768) /* delphij XXX 128MB */ kmem_zmax = PAGE_SIZE; if (vm_kmem_size_scale < 1) vm_kmem_size_scale = VM_KMEM_SIZE_SCALE; /* * Check if we should use defaults for the "vm_kmem_size" * variable: */ if (vm_kmem_size == 0) { vm_kmem_size = mem_size / vm_kmem_size_scale; vm_kmem_size = vm_kmem_size * PAGE_SIZE < vm_kmem_size ? vm_kmem_size_max : vm_kmem_size * PAGE_SIZE; if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) vm_kmem_size = vm_kmem_size_min; if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max) vm_kmem_size = vm_kmem_size_max; } if (vm_kmem_size == 0) panic("Tune VM_KMEM_SIZE_* for the platform"); /* * The amount of KVA space that is preallocated to the * kmem arena can be set statically at compile-time or manually * through the kernel environment. However, it is still limited to * twice the physical memory size, which has been sufficient to handle * the most severe cases of external fragmentation in the kmem arena. */ if (vm_kmem_size / 2 / PAGE_SIZE > mem_size) vm_kmem_size = 2 * mem_size * PAGE_SIZE; vm_kmem_size = round_page(vm_kmem_size); /* * With KASAN or KMSAN enabled, dynamically allocated kernel memory is * shadowed. Account for this when setting the UMA limit. */ #if defined(KASAN) vm_kmem_size = (vm_kmem_size * KASAN_SHADOW_SCALE) / (KASAN_SHADOW_SCALE + 1); #elif defined(KMSAN) vm_kmem_size /= 3; #endif #ifdef DEBUG_MEMGUARD tmp = memguard_fudge(vm_kmem_size, kernel_map); #else tmp = vm_kmem_size; #endif uma_set_limit(tmp); #ifdef DEBUG_MEMGUARD /* * Initialize MemGuard if support compiled in. 
MemGuard is a * replacement allocator used for detecting tamper-after-free * scenarios as they occur. It is only used for debugging. */ memguard_init(kernel_arena); #endif } /* * Initialize the kernel memory allocator */ /* ARGSUSED*/ static void mallocinit(void *dummy) { int i; uint8_t indx; mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF); kmeminit(); if (kmem_zmax < PAGE_SIZE || kmem_zmax > KMEM_ZMAX) kmem_zmax = KMEM_ZMAX; for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { int size = kmemzones[indx].kz_size; const char *name = kmemzones[indx].kz_name; size_t align; int subzone; align = UMA_ALIGN_PTR; if (powerof2(size) && size > sizeof(void *)) align = MIN(size, PAGE_SIZE) - 1; for (subzone = 0; subzone < numzones; subzone++) { kmemzones[indx].kz_zone[subzone] = uma_zcreate(name, size, #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, #else NULL, NULL, NULL, NULL, #endif align, UMA_ZONE_MALLOC); } for (;i <= size; i+= KMEM_ZBASE) kmemsize[i >> KMEM_ZSHIFT] = indx; } } SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_SECOND, mallocinit, NULL); void malloc_init(void *data) { struct malloc_type_internal *mtip; struct malloc_type *mtp; KASSERT(vm_cnt.v_page_count != 0, ("malloc_register before vm_init")); mtp = data; if (mtp->ks_version != M_VERSION) panic("malloc_init: type %s with unsupported version %lu", mtp->ks_shortdesc, mtp->ks_version); mtip = &mtp->ks_mti; mtip->mti_stats = uma_zalloc_pcpu(pcpu_zone_64, M_WAITOK | M_ZERO); mtp_set_subzone(mtp); mtx_lock(&malloc_mtx); mtp->ks_next = kmemstatistics; kmemstatistics = mtp; kmemcount++; mtx_unlock(&malloc_mtx); } void malloc_uninit(void *data) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; struct malloc_type *mtp, *temp; long temp_allocs, temp_bytes; int i; mtp = data; KASSERT(mtp->ks_version == M_VERSION, ("malloc_uninit: bad malloc type version")); mtx_lock(&malloc_mtx); mtip = &mtp->ks_mti; if (mtp != kmemstatistics) { for (temp = kmemstatistics; temp != NULL; temp = temp->ks_next) { if (temp->ks_next == mtp) { temp->ks_next = mtp->ks_next; break; } } KASSERT(temp, ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc)); } else kmemstatistics = mtp->ks_next; kmemcount--; mtx_unlock(&malloc_mtx); /* * Look for memory leaks. */ temp_allocs = temp_bytes = 0; for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); temp_allocs += mtsp->mts_numallocs; temp_allocs -= mtsp->mts_numfrees; temp_bytes += mtsp->mts_memalloced; temp_bytes -= mtsp->mts_memfreed; } if (temp_allocs > 0 || temp_bytes > 0) { printf("Warning: memory type %s leaked memory on destroy " "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc, temp_allocs, temp_bytes); } uma_zfree_pcpu(pcpu_zone_64, mtip->mti_stats); } struct malloc_type * malloc_desc2type(const char *desc) { struct malloc_type *mtp; mtx_assert(&malloc_mtx, MA_OWNED); for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { if (strcmp(mtp->ks_shortdesc, desc) == 0) return (mtp); } return (NULL); } static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) { struct malloc_type_stream_header mtsh; struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp, zeromts; struct malloc_type_header mth; struct malloc_type *mtp; int error, i; struct sbuf sbuf; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); mtx_lock(&malloc_mtx); bzero(&zeromts, sizeof(zeromts)); /* * Insert stream header. 
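 * The stream built here is what userland consumers such as libmemstat
 * parse: one struct malloc_type_stream_header, followed by mtsh_count
 * groups, each made of one struct malloc_type_header plus mtsh_maxcpus
 * copies of struct malloc_type_stats.  A consumer walks it roughly as
 * (sketch, read() standing in for pulling bytes off the buffer):
 *
 *	read(&header, sizeof(header));
 *	for (t = 0; t < header.mtsh_count; t++) {
 *		read(&mth, sizeof(mth));
 *		for (c = 0; c < header.mtsh_maxcpus; c++)
 *			read(&mts, sizeof(mts));
 *	}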
*/ bzero(&mtsh, sizeof(mtsh)); mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION; mtsh.mtsh_maxcpus = MAXCPU; mtsh.mtsh_count = kmemcount; (void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh)); /* * Insert alternating sequence of type headers and type statistics. */ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = &mtp->ks_mti; /* * Insert type header. */ bzero(&mth, sizeof(mth)); strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME); (void)sbuf_bcat(&sbuf, &mth, sizeof(mth)); /* * Insert type statistics for each CPU. */ for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); (void)sbuf_bcat(&sbuf, mtsp, sizeof(*mtsp)); } /* * Fill in the missing CPUs. */ for (; i < MAXCPU; i++) { (void)sbuf_bcat(&sbuf, &zeromts, sizeof(zeromts)); } } mtx_unlock(&malloc_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats", "Return malloc types"); SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0, "Count of kernel malloc types"); void malloc_type_list(malloc_type_list_func_t *func, void *arg) { struct malloc_type *mtp, **bufmtp; int count, i; size_t buflen; mtx_lock(&malloc_mtx); restart: mtx_assert(&malloc_mtx, MA_OWNED); count = kmemcount; mtx_unlock(&malloc_mtx); buflen = sizeof(struct malloc_type *) * count; bufmtp = malloc(buflen, M_TEMP, M_WAITOK); mtx_lock(&malloc_mtx); if (count < kmemcount) { free(bufmtp, M_TEMP); goto restart; } for (mtp = kmemstatistics, i = 0; mtp != NULL; mtp = mtp->ks_next, i++) bufmtp[i] = mtp; mtx_unlock(&malloc_mtx); for (i = 0; i < count; i++) (func)(bufmtp[i], arg); free(bufmtp, M_TEMP); } #ifdef DDB static int64_t get_malloc_stats(const struct malloc_type_internal *mtip, uint64_t *allocs, uint64_t *inuse) { const struct malloc_type_stats *mtsp; uint64_t frees, alloced, freed; int i; *allocs = 0; frees = 0; alloced = 0; freed = 0; for (i = 0; i <= mp_maxid; i++) { mtsp = zpcpu_get_cpu(mtip->mti_stats, i); *allocs += mtsp->mts_numallocs; frees += mtsp->mts_numfrees; alloced += mtsp->mts_memalloced; freed += mtsp->mts_memfreed; } *inuse = *allocs - frees; return (alloced - freed); } -DB_SHOW_COMMAND(malloc, db_show_malloc) +DB_SHOW_COMMAND_FLAGS(malloc, db_show_malloc, DB_CMD_MEMSAFE) { const char *fmt_hdr, *fmt_entry; struct malloc_type *mtp; uint64_t allocs, inuse; int64_t size; /* variables for sorting */ struct malloc_type *last_mtype, *cur_mtype; int64_t cur_size, last_size; int ties; if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jdK,%ju\n"; } else { fmt_hdr = "%18s %12s %12s %12s\n"; fmt_entry = "%18s %12ju %12jdK %12ju\n"; } db_printf(fmt_hdr, "Type", "InUse", "MemUse", "Requests"); /* Select sort, largest size first. */ last_mtype = NULL; last_size = INT64_MAX; for (;;) { cur_mtype = NULL; cur_size = -1; ties = 0; for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { /* * In the case of size ties, print out mtypes * in the order they are encountered. That is, * when we encounter the most recently output * mtype, we have already printed all preceding * ties, and we must print all following ties. 
*/ if (mtp == last_mtype) { ties = 1; continue; } size = get_malloc_stats(&mtp->ks_mti, &allocs, &inuse); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_mtype = mtp; } } if (cur_mtype == NULL) break; size = get_malloc_stats(&cur_mtype->ks_mti, &allocs, &inuse); db_printf(fmt_entry, cur_mtype->ks_shortdesc, inuse, howmany(size, 1024), allocs); if (db_pager_quit) break; last_mtype = cur_mtype; last_size = cur_size; } } #if MALLOC_DEBUG_MAXZONES > 1 DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches) { struct malloc_type_internal *mtip; struct malloc_type *mtp; u_int subzone; if (!have_addr) { db_printf("Usage: show multizone_matches \n"); return; } mtp = (void *)addr; if (mtp->ks_version != M_VERSION) { db_printf("Version %lx does not match expected %x\n", mtp->ks_version, M_VERSION); return; } mtip = &mtp->ks_mti; subzone = mtip->mti_zone; for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = &mtp->ks_mti; if (mtip->mti_zone != subzone) continue; db_printf("%s\n", mtp->ks_shortdesc); if (db_pager_quit) break; } } #endif /* MALLOC_DEBUG_MAXZONES > 1 */ #endif /* DDB */ diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 4962e83c4ad4..20e0c1fe3b45 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -1,3564 +1,3564 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #endif SDT_PROVIDER_DEFINE(proc); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread); static void pgdelete(struct pgrp *); static int pgrp_init(void *mem, int size, int flags); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); static void pargs_free(struct pargs *pa); /* * Other process lists */ struct pidhashhead *pidhashtbl = NULL; struct sx *pidhashtbl_lock; u_long pidhash; u_long pidhashlock; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc = LIST_HEAD_INITIALIZER(allproc); struct sx __exclusive_cache_line allproc_lock; struct sx __exclusive_cache_line proctree_lock; struct mtx __exclusive_cache_line ppeers_lock; struct mtx __exclusive_cache_line procid_lock; uma_zone_t proc_zone; uma_zone_t pgrp_zone; /* * The offset of various fields in struct proc and struct thread. * These are used by kernel debuggers to enumerate kernel threads and * processes. 
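 * A debugger that only has the kernel symbol table can read these
 * exported constants and then walk the process list without knowing
 * the full structure layouts.  Roughly (sketch; read_ptr(), read_int(),
 * read_str() and resolve() are placeholders for the debugger's own
 * target-memory accessors):
 *
 *	proc = read_ptr(resolve("allproc"));		// list head
 *	while (proc != 0) {
 *		pid  = read_int(proc + proc_off_p_pid);
 *		name = read_str(proc + proc_off_p_comm);
 *		proc = read_ptr(proc + proc_off_p_list);	// next entry
 *	}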
*/ const int proc_off_p_pid = offsetof(struct proc, p_pid); const int proc_off_p_comm = offsetof(struct proc, p_comm); const int proc_off_p_list = offsetof(struct proc, p_list); const int proc_off_p_hash = offsetof(struct proc, p_hash); const int proc_off_p_threads = offsetof(struct proc, p_threads); const int thread_off_td_tid = offsetof(struct thread, td_tid); const int thread_off_td_name = offsetof(struct thread, td_name); const int thread_off_td_oncpu = offsetof(struct thread, td_oncpu); const int thread_off_td_pcb = offsetof(struct thread, td_pcb); const int thread_off_td_plist = offsetof(struct thread, td_plist); EVENTHANDLER_LIST_DEFINE(process_ctor); EVENTHANDLER_LIST_DEFINE(process_dtor); EVENTHANDLER_LIST_DEFINE(process_init); EVENTHANDLER_LIST_DEFINE(process_fini); EVENTHANDLER_LIST_DEFINE(process_exit); EVENTHANDLER_LIST_DEFINE(process_fork); EVENTHANDLER_LIST_DEFINE(process_exec); int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "Kernel stack size in pages"); static int vmmap_skip_res_cnt = 0; SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW, &vmmap_skip_res_cnt, 0, "Skip calculation of the pages resident count in kern.proc.vmmap"); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #ifdef COMPAT_FREEBSD32 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE); #endif /* * Initialize global process hashing structures. */ void procinit(void) { u_long i; sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); mtx_init(&procid_lock, "procid", NULL, MTX_DEF); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pidhashlock = (pidhash + 1) / 64; if (pidhashlock > 0) pidhashlock--; pidhashtbl_lock = malloc(sizeof(*pidhashtbl_lock) * (pidhashlock + 1), M_PROC, M_WAITOK | M_ZERO); for (i = 0; i < pidhashlock + 1; i++) sx_init_flags(&pidhashtbl_lock[i], "pidhash", SX_DUPOK); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); pgrp_zone = uma_zcreate("PGRP", sizeof(struct pgrp), NULL, NULL, pgrp_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; struct thread *td; p = (struct proc *)mem; #ifdef KDTRACE_HOOKS kdtrace_proc_ctor(p); #endif EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); td = FIRST_THREAD_IN_PROC(p); if (td != NULL) { /* Make sure all thread constructors are executed */ EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); } return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); if (td != NULL) { #ifdef INVARIANTS KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); /* Make sure all thread destructors are executed */ EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); } EVENTHANDLER_DIRECT_INVOKE(process_dtor, p); #ifdef KDTRACE_HOOKS kdtrace_proc_dtor(p); #endif if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); } /* * Initialize type-stable parts of a proc (when newly created). 
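 * UMA calls the init function once, when an item is first created for
 * the zone, and the ctor on every allocation; dtor and fini mirror them
 * on release.  Because the PROC zone is created with UMA_ZONE_NOFREE,
 * items are never destroyed, so the locks and lists set up here remain
 * valid for the lifetime of the system ("type stability").  The
 * registration in procinit() above follows the usual pattern:
 *
 *	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 *	    proc_ctor, proc_dtor, proc_init, proc_fini,
 *	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);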
*/ static int proc_init(void *mem, int size, int flags) { struct proc *p; p = (struct proc *)mem; mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW); cv_init(&p->p_pwait, "ppwait"); TAILQ_INIT(&p->p_threads); /* all threads in proc */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); p->p_stats = pstats_alloc(); p->p_pgrp = NULL; return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; EVENTHANDLER_DIRECT_INVOKE(process_fini, p); pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } static int pgrp_init(void *mem, int size, int flags) { struct pgrp *pg; pg = mem; mtx_init(&pg->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); return (0); } /* * PID space management. * * These bitmaps are used by fork_findpid. */ bitstr_t bit_decl(proc_id_pidmap, PID_MAX); bitstr_t bit_decl(proc_id_grpidmap, PID_MAX); bitstr_t bit_decl(proc_id_sessidmap, PID_MAX); bitstr_t bit_decl(proc_id_reapmap, PID_MAX); static bitstr_t *proc_id_array[] = { proc_id_pidmap, proc_id_grpidmap, proc_id_sessidmap, proc_id_reapmap, }; void proc_id_set(int type, pid_t id) { KASSERT(type >= 0 && type < nitems(proc_id_array), ("invalid type %d\n", type)); mtx_lock(&procid_lock); KASSERT(bit_test(proc_id_array[type], id) == 0, ("bit %d already set in %d\n", id, type)); bit_set(proc_id_array[type], id); mtx_unlock(&procid_lock); } void proc_id_set_cond(int type, pid_t id) { KASSERT(type >= 0 && type < nitems(proc_id_array), ("invalid type %d\n", type)); if (bit_test(proc_id_array[type], id)) return; mtx_lock(&procid_lock); bit_set(proc_id_array[type], id); mtx_unlock(&procid_lock); } void proc_id_clear(int type, pid_t id) { KASSERT(type >= 0 && type < nitems(proc_id_array), ("invalid type %d\n", type)); mtx_lock(&procid_lock); KASSERT(bit_test(proc_id_array[type], id) != 0, ("bit %d not set in %d\n", id, type)); bit_clear(proc_id_array[type], id); mtx_unlock(&procid_lock); } /* * Is p an inferior of the current process? */ int inferior(struct proc *p) { sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) return (0); } return (1); } /* * Shared lock all the pid hash lists. */ void pidhash_slockall(void) { u_long i; for (i = 0; i < pidhashlock + 1; i++) sx_slock(&pidhashtbl_lock[i]); } /* * Shared unlock all the pid hash lists. */ void pidhash_sunlockall(void) { u_long i; for (i = 0; i < pidhashlock + 1; i++) sx_sunlock(&pidhashtbl_lock[i]); } /* * Similar to pfind_any(), this function finds zombies. */ struct proc * pfind_any_locked(pid_t pid) { struct proc *p; sx_assert(PIDHASHLOCK(pid), SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); p = NULL; } break; } } return (p); } /* * Locate a process by number. * * By not returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. 
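 *
 * A minimal usage sketch (not compiled here): both lookups return with the
 * process lock held, so callers are expected to drop it themselves.
 *
 *      struct proc *p;
 *
 *      p = pfind(pid);                 (NULL, or a live process, locked)
 *      if (p == NULL)
 *              return (ESRCH);
 *      ... inspect p while it cannot exit ...
 *      PROC_UNLOCK(p);
 *
 * pfind_any() behaves the same but also returns zombies.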
*/ static __always_inline struct proc * _pfind(pid_t pid, bool zombie) { struct proc *p; p = curproc; if (p->p_pid == pid) { PROC_LOCK(p); return (p); } sx_slock(PIDHASHLOCK(pid)); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); if (p->p_state == PRS_NEW || (!zombie && p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); p = NULL; } break; } } sx_sunlock(PIDHASHLOCK(pid)); return (p); } struct proc * pfind(pid_t pid) { return (_pfind(pid, false)); } /* * Same as pfind but allow zombies. */ struct proc * pfind_any(pid_t pid) { return (_pfind(pid, true)); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pid_t pgid) { struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Locate process and do additional manipulations, depending on flags. */ int pget(pid_t pid, int flags, struct proc **pp) { struct proc *p; struct thread *td1; int error; p = curproc; if (p->p_pid == pid) { PROC_LOCK(p); } else { p = NULL; if (pid <= PID_MAX) { if ((flags & PGET_NOTWEXIT) == 0) p = pfind_any(pid); else p = pfind(pid); } else if ((flags & PGET_NOTID) == 0) { td1 = tdfind(pid, -1); if (td1 != NULL) p = td1->td_proc; } if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { error = p_cansee(curthread, p); if (error != 0) goto errout; } } if ((flags & PGET_CANDEBUG) != 0) { error = p_candebug(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_ISCURRENT) != 0 && curproc != p) { error = EPERM; goto errout; } if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto errout; } if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) { /* * XXXRW: Not clear ESRCH is the right error during proc * execve(). */ error = ESRCH; goto errout; } if ((flags & PGET_HOLD) != 0) { _PHOLD(p); PROC_UNLOCK(p); } *pp = p; return (0); errout: PROC_UNLOCK(p); return (error); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess) { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); KASSERT(pgfind(pgid) == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; proc_id_set(PROC_ID_SESSION, p->p_pid); refcount_init(&sess->s_count, 1); sess->s_ttyvp = NULL; sess->s_ttydp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; sess_hold(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; proc_id_set(PROC_ID_GROUP, p->p_pid); LIST_INIT(&pgrp->pg_members); pgrp->pg_flags = 0; /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. 
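 *
 * For context, a rough sketch (an assumption, not lifted from this file) of
 * how a setsid(2)-style caller is expected to drive enterpgrp(): the pgrp
 * and session are pre-allocated and proctree_lock is held exclusively
 * across the call.
 *
 *      newpgrp = uma_zalloc(pgrp_zone, M_WAITOK);
 *      newsess = malloc(sizeof(*newsess), M_SESSION, M_WAITOK | M_ZERO);
 *      sx_xlock(&proctree_lock);
 *      ... verify p is not already a group or session leader ...
 *      error = enterpgrp(p, p->p_pid, newpgrp, newsess);
 *      sx_xunlock(&proctree_lock);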
*/ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(struct proc *p, struct pgrp *pgrp) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p proc %p\n", __func__, pgrp->pg_session, p->p_session, p)); KASSERT(pgrp != p->p_pgrp, ("%s: p %p belongs to pgrp %p", __func__, p, pgrp)); doenterpgrp(p, pgrp); return (0); } /* * If true, any child of q which belongs to group pgrp, qualifies the * process group pgrp as not orphaned. */ static bool isjobproc(struct proc *q, struct pgrp *pgrp) { sx_assert(&proctree_lock, SX_LOCKED); return (q->p_pgrp != pgrp && q->p_pgrp->pg_session == pgrp->pg_session); } static struct proc * jobc_reaper(struct proc *p) { struct proc *pp; sx_assert(&proctree_lock, SA_LOCKED); for (pp = p;;) { pp = pp->p_reaper; if (pp->p_reaper == pp || (pp->p_treeflag & P_TREE_GRPEXITED) == 0) return (pp); } } static struct proc * jobc_parent(struct proc *p, struct proc *p_exiting) { struct proc *pp; sx_assert(&proctree_lock, SA_LOCKED); pp = proc_realparent(p); if (pp->p_pptr == NULL || pp == p_exiting || (pp->p_treeflag & P_TREE_GRPEXITED) == 0) return (pp); return (jobc_reaper(pp)); } static int pgrp_calc_jobc(struct pgrp *pgrp) { struct proc *q; int cnt; #ifdef INVARIANTS if (!mtx_owned(&pgrp->pg_mtx)) sx_assert(&proctree_lock, SA_LOCKED); #endif cnt = 0; LIST_FOREACH(q, &pgrp->pg_members, p_pglist) { if ((q->p_treeflag & P_TREE_GRPEXITED) != 0 || q->p_pptr == NULL) continue; if (isjobproc(jobc_parent(q, NULL), pgrp)) cnt++; } return (cnt); } /* * Move p to a process group */ static void doenterpgrp(struct proc *p, struct pgrp *pgrp) { struct pgrp *savepgrp; struct proc *pp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; pp = jobc_parent(p, NULL); PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); if (isjobproc(pp, savepgrp) && pgrp_calc_jobc(savepgrp) == 1) orphanpg(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); if (isjobproc(pp, pgrp)) pgrp->pg_flags &= ~PGRP_ORPHANED; PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(struct proc *p) { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(struct pgrp *pgrp) { struct session *savesess; struct tty *tp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. The proctree lock ensures that * new sigio structures will not be added after this point. 
*/ funsetownlst(&pgrp->pg_sigiolst); PGRP_LOCK(pgrp); tp = pgrp->pg_session->s_ttyp; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; PGRP_UNLOCK(pgrp); /* Remove the reference to the pgrp before deallocating it. */ if (tp != NULL) { tty_lock(tp); tty_rel_pgrp(tp, pgrp); } proc_id_clear(PROC_ID_GROUP, pgrp->pg_id); uma_zfree(pgrp_zone, pgrp); sess_release(savesess); } static void fixjobc_kill(struct proc *p) { struct proc *q; struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); pgrp = p->p_pgrp; PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * p no longer affects process group orphanage for children. * It is marked by the flag because p is only physically * removed from its process group on wait(2). */ MPASS((p->p_treeflag & P_TREE_GRPEXITED) == 0); p->p_treeflag |= P_TREE_GRPEXITED; /* * Check if exiting p orphans its own group. */ pgrp = p->p_pgrp; if (isjobproc(jobc_parent(p, NULL), pgrp)) { PGRP_LOCK(pgrp); if (pgrp_calc_jobc(pgrp) == 0) orphanpg(pgrp); PGRP_UNLOCK(pgrp); } /* * Check this process' children to see whether they qualify * their process groups after reparenting to reaper. */ LIST_FOREACH(q, &p->p_children, p_sibling) { pgrp = q->p_pgrp; PGRP_LOCK(pgrp); if (pgrp_calc_jobc(pgrp) == 0) { /* * We want to handle exactly the children that * has p as realparent. Then, when calculating * jobc_parent for children, we should ignore * P_TREE_GRPEXITED flag already set on p. */ if (jobc_parent(q, p) == p && isjobproc(p, pgrp)) orphanpg(pgrp); } else pgrp->pg_flags &= ~PGRP_ORPHANED; PGRP_UNLOCK(pgrp); } LIST_FOREACH(q, &p->p_orphans, p_orphan) { pgrp = q->p_pgrp; PGRP_LOCK(pgrp); if (pgrp_calc_jobc(pgrp) == 0) { if (isjobproc(p, pgrp)) orphanpg(pgrp); } else pgrp->pg_flags &= ~PGRP_ORPHANED; PGRP_UNLOCK(pgrp); } } void killjobc(void) { struct session *sp; struct tty *tp; struct proc *p; struct vnode *ttyvp; p = curproc; MPASS(p->p_flag & P_WEXIT); sx_assert(&proctree_lock, SX_LOCKED); if (SESS_LEADER(p)) { sp = p->p_session; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp); } devfs_ctty_unref(ttyvp); sx_xlock(&proctree_lock); } } fixjobc_kill(p); } /* * A process group has become orphaned, mark it as such for signal * delivery code. If there are any stopped processes in the group, * hang-up all process in that group. 
*/ static void orphanpg(struct pgrp *pg) { struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); pg->pg_flags |= PGRP_ORPHANED; LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p) == P_STOPPED_SIG) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); kern_psignal(p, SIGHUP); kern_psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sess_hold(struct session *s) { refcount_acquire(&s->s_count); } void sess_release(struct session *s) { if (refcount_release(&s->s_count)) { if (s->s_ttyp != NULL) { tty_lock(s->s_ttyp); tty_rel_sess(s->s_ttyp, s); } proc_id_clear(PROC_ID_SESSION, s->s_sid); mtx_destroy(&s->s_mtx); free(s, M_SESSION); } } #ifdef DDB static void db_print_pgrp_one(struct pgrp *pgrp, struct proc *p) { db_printf( " pid %d at %p pr %d pgrp %p e %d jc %d\n", p->p_pid, p, p->p_pptr == NULL ? -1 : p->p_pptr->p_pid, p->p_pgrp, (p->p_treeflag & P_TREE_GRPEXITED) != 0, p->p_pptr == NULL ? 0 : isjobproc(p->p_pptr, pgrp)); } -DB_SHOW_COMMAND(pgrpdump, pgrpdump) +DB_SHOW_COMMAND_FLAGS(pgrpdump, pgrpdump, DB_CMD_MEMSAFE) { struct pgrp *pgrp; struct proc *p; int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { db_printf("indx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { db_printf( " pgrp %p, pgid %d, sess %p, sesscnt %d, mem %p\n", pgrp, (int)pgrp->pg_id, pgrp->pg_session, pgrp->pg_session->s_count, LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) db_print_pgrp_one(pgrp, p); } } } } #endif /* DDB */ /* * Calculate the kinfo_proc members which contain process-wide * informations. * Must be called with the target process locked. */ static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_estcpu = 0; kp->ki_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); kp->ki_pctcpu += sched_pctcpu(td); kp->ki_estcpu += sched_estcpu(td); thread_unlock(td); } } /* * Fill in any information that is common to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct ucred *cred; struct sigacts *ps; struct timeval boottime; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; kp->ki_addr =/* p->p_addr; */0; /* XXX */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = ktr_get_tracevp(p, false); kp->ki_traceflag = p->p_traceflag; #endif kp->ki_fd = p->p_fd; kp->ki_pd = p->p_pd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; kp->ki_flag2 = p->p_flag2; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; kp->ki_cr_flags = 0; if (cred->cr_flags & CRED_FLAG_CAPMODE) kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE; /* XXX bde doesn't like KI_NGROUPS */ if (cred->cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = cred->cr_ngroups; bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside the jail, use 0 as a jail ID. 
*/ if (cred->cr_prison != curthread->td_ucred->cr_prison) kp->ki_jid = cred->cr_prison->pr_id; } strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name, sizeof(kp->ki_loginclass)); } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; kp->ki_fibnum = p->p_fibnum; kp->ki_start = p->p_stats->p_start; getboottime(&boottime); timevaladd(&kp->ki_start, &boottime); PROC_STATLOCK(p); rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_STATUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child times in a single value. */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); FOREACH_THREAD_IN_PROC(p, td0) kp->ki_cow += td0->td_cow; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) { kp->ki_ppid = p->p_oppid; if (p->p_flag & P_TRACED) kp->ki_tracer = p->p_pptr->p_pid; } } /* * Fill job-related process information. */ static void fill_kinfo_proc_pgrp(struct proc *p, struct kinfo_proc *kp) { struct tty *tp; struct session *sp; struct pgrp *pgrp; sx_assert(&proctree_lock, SA_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); pgrp = p->p_pgrp; if (pgrp == NULL) return; kp->ki_pgid = pgrp->pg_id; kp->ki_jobc = pgrp_calc_jobc(pgrp); sp = pgrp->pg_session; tp = NULL; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; tp = sp->s_ttyp; SESS_UNLOCK(sp); } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = tty_udev(tp); kp->ki_tdev_freebsd11 = kp->ki_tdev; /* truncate */ kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else { kp->ki_tdev = NODEV; kp->ki_tdev_freebsd11 = kp->ki_tdev; /* truncate */ } } /* * Fill in information that is thread specific. Must be called with * target process locked. If 'preferthread' is set, overwrite certain * process-related fields that are maintained for both threads and * processes. 
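 *
 * A short sketch of the two calling modes, as used by kern_proc_out()
 * further down in this file:
 *
 *      fill_kinfo_proc(p, &ki);                (process-wide view,
 *                                               preferthread == 0)
 *      FOREACH_THREAD_IN_PROC(p, td)
 *              fill_kinfo_thread(td, &ki, 1);  (per-thread view, overrides
 *                                               the process-wide fields)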
*/ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread) { struct proc *p; p = td->td_proc; kp->ki_tdaddr = td; PROC_LOCK_ASSERT(p, MA_OWNED); if (preferthread) PROC_STATLOCK(p); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); if (strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname)) >= sizeof(kp->ki_tdname)) { strlcpy(kp->ki_moretdname, td->td_name + sizeof(kp->ki_tdname) - 1, sizeof(kp->ki_moretdname)); } else { bzero(kp->ki_moretdname, sizeof(kp->ki_moretdname)); } if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* approximate. */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; /* * Note: legacy fields; clamp at the old NOCPU value and/or * the maximum u_char CPU value. */ if (td->td_lastcpu == NOCPU) kp->ki_lastcpu_old = NOCPU_OLD; else if (td->td_lastcpu > MAXCPU_OLD) kp->ki_lastcpu_old = MAXCPU_OLD; else kp->ki_lastcpu_old = td->td_lastcpu; if (td->td_oncpu == NOCPU) kp->ki_oncpu_old = NOCPU_OLD; else if (td->td_oncpu > MAXCPU_OLD) kp->ki_oncpu_old = MAXCPU_OLD; else kp->ki_oncpu_old = td->td_oncpu; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; if (preferthread) { rufetchtd(td, &kp->ki_rusage); kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime); kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = sched_estcpu(td); kp->ki_cow = td->td_cow; } /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; if (preferthread) kp->ki_siglist = td->td_siglist; kp->ki_sigmask = td->td_sigmask; thread_unlock(td); if (preferthread) PROC_STATUNLOCK(p); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { MPASS(FIRST_THREAD_IN_PROC(p) != NULL); bzero(kp, sizeof(*kp)); fill_kinfo_proc_pgrp(p,kp); fill_kinfo_proc_only(p, kp); fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0); fill_kinfo_aggregate(p, kp); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). 
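 *
 * Illustrative sketch with a hypothetical struct (this is not the real
 * struct pstats layout): the pstat_startzero/endzero and startcopy/endcopy
 * names are aliases for member names, letting __rangeof() bzero or bcopy a
 * whole run of fields at once.
 *
 *      struct stats {
 *      #define st_startzero    st_faults
 *              long            st_faults;      (zeroed in the child)
 *              long            st_msgs;
 *      #define st_endzero      st_start
 *      #define st_startcopy    st_start
 *              struct timeval  st_start;       (copied to the child)
 *      #define st_endcopy      st_pad
 *              int             st_pad;
 *      };
 *
 *      bzero(&dst->st_startzero,
 *          __rangeof(struct stats, st_startzero, st_endzero));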
*/ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } #ifdef COMPAT_FREEBSD32 /* * This function is typically used to copy out the kernel address, so * it can be replaced by assignment of zero. */ static inline uint32_t ptr32_trim(const void *ptr) { uintptr_t uptr; uptr = (uintptr_t)ptr; return ((uptr > UINT_MAX) ? 0 : uptr); } #define PTRTRIM_CP(src,dst,fld) \ do { (dst).fld = ptr32_trim((src).fld); } while (0) static void freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32) { int i; bzero(ki32, sizeof(struct kinfo_proc32)); ki32->ki_structsize = sizeof(struct kinfo_proc32); CP(*ki, *ki32, ki_layout); PTRTRIM_CP(*ki, *ki32, ki_args); PTRTRIM_CP(*ki, *ki32, ki_paddr); PTRTRIM_CP(*ki, *ki32, ki_addr); PTRTRIM_CP(*ki, *ki32, ki_tracep); PTRTRIM_CP(*ki, *ki32, ki_textvp); PTRTRIM_CP(*ki, *ki32, ki_fd); PTRTRIM_CP(*ki, *ki32, ki_vmspace); PTRTRIM_CP(*ki, *ki32, ki_wchan); CP(*ki, *ki32, ki_pid); CP(*ki, *ki32, ki_ppid); CP(*ki, *ki32, ki_pgid); CP(*ki, *ki32, ki_tpgid); CP(*ki, *ki32, ki_sid); CP(*ki, *ki32, ki_tsid); CP(*ki, *ki32, ki_jobc); CP(*ki, *ki32, ki_tdev); CP(*ki, *ki32, ki_tdev_freebsd11); CP(*ki, *ki32, ki_siglist); CP(*ki, *ki32, ki_sigmask); CP(*ki, *ki32, ki_sigignore); CP(*ki, *ki32, ki_sigcatch); CP(*ki, *ki32, ki_uid); CP(*ki, *ki32, ki_ruid); CP(*ki, *ki32, ki_svuid); CP(*ki, *ki32, ki_rgid); CP(*ki, *ki32, ki_svgid); CP(*ki, *ki32, ki_ngroups); for (i = 0; i < KI_NGROUPS; i++) CP(*ki, *ki32, ki_groups[i]); CP(*ki, *ki32, ki_size); CP(*ki, *ki32, ki_rssize); CP(*ki, *ki32, ki_swrss); CP(*ki, *ki32, ki_tsize); CP(*ki, *ki32, ki_dsize); CP(*ki, *ki32, ki_ssize); CP(*ki, *ki32, ki_xstat); CP(*ki, *ki32, ki_acflag); CP(*ki, *ki32, ki_pctcpu); CP(*ki, *ki32, ki_estcpu); CP(*ki, *ki32, ki_slptime); CP(*ki, *ki32, ki_swtime); CP(*ki, *ki32, ki_cow); CP(*ki, *ki32, ki_runtime); TV_CP(*ki, *ki32, ki_start); TV_CP(*ki, *ki32, ki_childtime); CP(*ki, *ki32, ki_flag); CP(*ki, *ki32, ki_kiflag); CP(*ki, *ki32, ki_traceflag); CP(*ki, *ki32, ki_stat); CP(*ki, *ki32, ki_nice); CP(*ki, *ki32, ki_lock); CP(*ki, *ki32, ki_rqindex); CP(*ki, *ki32, ki_oncpu); CP(*ki, *ki32, ki_lastcpu); /* XXX TODO: wrap cpu value as appropriate */ CP(*ki, *ki32, ki_oncpu_old); CP(*ki, *ki32, ki_lastcpu_old); bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1); bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1); bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1); bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1); bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1); bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1); bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1); bcopy(ki->ki_moretdname, ki32->ki_moretdname, MAXCOMLEN - TDNAMLEN + 1); CP(*ki, *ki32, ki_tracer); CP(*ki, *ki32, ki_flag2); CP(*ki, *ki32, ki_fibnum); CP(*ki, *ki32, ki_cr_flags); CP(*ki, *ki32, ki_jid); CP(*ki, *ki32, ki_numthreads); CP(*ki, *ki32, ki_tid); CP(*ki, *ki32, ki_pri); freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage); freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch); PTRTRIM_CP(*ki, *ki32, ki_pcb); PTRTRIM_CP(*ki, *ki32, ki_kstack); PTRTRIM_CP(*ki, *ki32, ki_udata); PTRTRIM_CP(*ki, *ki32, ki_tdaddr); CP(*ki, *ki32, ki_sflag); CP(*ki, *ki32, ki_tdflags); } #endif static ssize_t kern_proc_out_size(struct 
proc *p, int flags) { ssize_t size = 0; PROC_LOCK_ASSERT(p, MA_OWNED); if ((flags & KERN_PROC_NOTHREADS) != 0) { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { size += sizeof(struct kinfo_proc32); } else #endif size += sizeof(struct kinfo_proc); } else { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) size += sizeof(struct kinfo_proc32) * p->p_numthreads; else #endif size += sizeof(struct kinfo_proc) * p->p_numthreads; } PROC_UNLOCK(p); return (size); } int kern_proc_out(struct proc *p, struct sbuf *sb, int flags) { struct thread *td; struct kinfo_proc ki; #ifdef COMPAT_FREEBSD32 struct kinfo_proc32 ki32; #endif int error; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(FIRST_THREAD_IN_PROC(p) != NULL); error = 0; fill_kinfo_proc(p, &ki); if ((flags & KERN_PROC_NOTHREADS) != 0) { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; } else { FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &ki, 1); #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; if (error != 0) break; } } PROC_UNLOCK(p); return (error); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags) { struct sbuf sb; struct kinfo_proc ki; int error, error2; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, kern_proc_out_size(p, flags))); sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = kern_proc_out(p, &sb, flags); error2 = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0) return (error); else if (error2 != 0) return (error2); return (0); } int proc_iterate(int (*cb)(struct proc *, void *), void *cbarg) { struct proc *p; int error, i, j; for (i = 0; i < pidhashlock + 1; i++) { sx_slock(&proctree_lock); sx_slock(&pidhashtbl_lock[i]); for (j = i; j <= pidhash; j += pidhashlock + 1) { LIST_FOREACH(p, &pidhashtbl[j], p_hash) { if (p->p_state == PRS_NEW) continue; error = cb(p, cbarg); PROC_LOCK_ASSERT(p, MA_NOTOWNED); if (error != 0) { sx_sunlock(&pidhashtbl_lock[i]); sx_sunlock(&proctree_lock); return (error); } } } sx_sunlock(&pidhashtbl_lock[i]); sx_sunlock(&proctree_lock); } return (0); } struct kern_proc_out_args { struct sysctl_req *req; int flags; int oid_number; int *name; }; static int sysctl_kern_proc_iterate(struct proc *p, void *origarg) { struct kern_proc_out_args *arg = origarg; int *name = arg->name; int oid_number = arg->oid_number; int flags = arg->flags; struct sysctl_req *req = arg->req; int error = 0; PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) goto skip; /* * TODO - make more efficient (see notes below). * do by session. 
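 *
 * For reference, a userland sketch (not part of this file) of how such a
 * filtered query arrives here via sysctl(3), using the usual
 * probe-then-fetch pattern:
 *
 *      int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_UID, getuid() };
 *      size_t len;
 *      struct kinfo_proc *kp;
 *
 *      sysctl(mib, 4, NULL, &len, NULL, 0);    (size estimate)
 *      kp = malloc(len);
 *      sysctl(mib, 4, kp, &len, NULL, 0);      (len / sizeof(*kp) entries)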
*/ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) goto skip; break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) goto skip; break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) goto skip; break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) goto skip; break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) goto skip; /* XXX proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); goto skip; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) goto skip; break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) goto skip; break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags); PROC_LOCK_ASSERT(p, MA_NOTOWNED); return (error); skip: PROC_UNLOCK(p); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { struct kern_proc_out_args iterarg; int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) flags |= KERN_PROC_MASK32; #endif if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); sx_slock(&proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, flags); sx_sunlock(&proctree_lock); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (req->oldptr == NULL) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } else { error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); } iterarg.flags = flags; iterarg.oid_number = oid_number; iterarg.req = req; iterarg.name = name; error = proc_iterate(sysctl_kern_proc_iterate, &iterarg); return (error); } struct pargs * pargs_alloc(int len) { struct pargs *pa; pa = malloc(sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } static void pargs_free(struct pargs *pa) { free(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } static int proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf, size_t len) { ssize_t n; /* * This may return a short read if the string is shorter than the chunk * and is aligned at the end of the page, and the following page is not * mapped. */ n = proc_readmem(td, p, (vm_offset_t)sptr, buf, len); if (n <= 0) return (ENOMEM); return (0); } #define PROC_AUXV_MAX 256 /* Safety limit on auxv size. 
*/ enum proc_vector_type { PROC_ARG, PROC_ENV, PROC_AUX, }; #ifdef COMPAT_FREEBSD32 static int get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct freebsd32_ps_strings pss; Elf32_Auxinfo aux; vm_offset_t vptr, ptr; uint32_t *proc_vector32; char **proc_vector; size_t vsize, size; int i, error; error = 0; if (proc_readmem(td, p, PROC_PS_STRINGS(p), &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)PTRIN(pss.ps_argvstr); vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_ENV: vptr = (vm_offset_t)PTRIN(pss.ps_envstr); vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_AUX: vptr = (vm_offset_t)PTRIN(pss.ps_envstr) + (pss.ps_nenvstr + 1) * sizeof(int32_t); if (vptr % 4 != 0) return (ENOEXEC); for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); } proc_vector32 = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector32, size) != size) { error = ENOMEM; goto done; } if (type == PROC_AUX) { *proc_vectorp = (char **)proc_vector32; *vsizep = vsize; return (0); } proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK); for (i = 0; i < (int)vsize; i++) proc_vector[i] = PTRIN(proc_vector32[i]); *proc_vectorp = proc_vector; *vsizep = vsize; done: free(proc_vector32, M_TEMP); return (error); } #endif static int get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct ps_strings pss; Elf_Auxinfo aux; vm_offset_t vptr, ptr; char **proc_vector; size_t vsize, size; int i; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) return (get_proc_vector32(td, p, proc_vectorp, vsizep, type)); #endif if (proc_readmem(td, p, PROC_PS_STRINGS(p), &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)pss.ps_argvstr; vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_ENV: vptr = (vm_offset_t)pss.ps_envstr; vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_AUX: /* * The aux array is just above env array on the stack. Check * that the address is naturally aligned. */ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1) * sizeof(char *); #if __ELF_WORD_SIZE == 64 if (vptr % sizeof(uint64_t) != 0) #else if (vptr % sizeof(uint32_t) != 0) #endif return (ENOEXEC); /* * We count the array size reading the aux vectors from the * stack until AT_NULL vector is returned. So (to keep the code * simple) we read the process stack twice: the first time here * to find the size and the second time when copying the vectors * to the allocated proc_vector. 
*/ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } /* * If the PROC_AUXV_MAX entries are iterated over, and we have * not reached AT_NULL, it is most likely we are reading wrong * data: either the process doesn't have auxv array or data has * been modified. Return the error in this case. */ if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); /* In case we are built without INVARIANTS. */ } proc_vector = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector, size) != size) { free(proc_vector, M_TEMP); return (ENOMEM); } *proc_vectorp = proc_vector; *vsizep = vsize; return (0); } #define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */ static int get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb, enum proc_vector_type type) { size_t done, len, nchr, vsize; int error, i; char **proc_vector, *sptr; char pss_string[GET_PS_STRINGS_CHUNK_SZ]; PROC_ASSERT_HELD(p); /* * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes. */ nchr = 2 * (PATH_MAX + ARG_MAX); error = get_proc_vector(td, p, &proc_vector, &vsize, type); if (error != 0) return (error); for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) { /* * The program may have scribbled into its argv array, e.g. to * remove some arguments. If that has happened, break out * before trying to read from NULL. */ if (proc_vector[i] == NULL) break; for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) { error = proc_read_string(td, p, sptr, pss_string, sizeof(pss_string)); if (error != 0) goto done; len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ); if (done + len >= nchr) len = nchr - done - 1; sbuf_bcat(sb, pss_string, len); if (len != GET_PS_STRINGS_CHUNK_SZ) break; done += GET_PS_STRINGS_CHUNK_SZ; } sbuf_bcat(sb, "", 1); done += len + 1; } done: free(proc_vector, M_TEMP); return (error); } int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ARG)); } int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ENV)); } int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb) { size_t vsize, size; char **auxv; int error; error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX); if (error == 0) { #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) size = vsize * sizeof(Elf32_Auxinfo); else #endif size = vsize * sizeof(Elf_Auxinfo); if (sbuf_bcat(sb, auxv, size) != 0) error = ENOMEM; free(auxv, M_TEMP); } return (error); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; struct sbuf sb; int flags, error = 0, error2; pid_t pid; if (namelen != 1) return (EINVAL); p = curproc; pid = (pid_t)name[0]; if (pid == -1) { pid = p->p_pid; } /* * If the query is for this process and it is single-threaded, there * is nobody to modify pargs, thus we can just read. 
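 *
 * A userland sketch (not part of this file) of both directions; reads work
 * for any visible process, while writes are only accepted for the calling
 * process itself (PGET_ISCURRENT below):
 *
 *      int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid };
 *      char buf[4096];
 *      size_t len = sizeof(buf);
 *
 *      sysctl(mib, 4, buf, &len, NULL, 0);     (argv strings, NUL-separated)
 *
 *      const char *title = "my new title";
 *      sysctl(mib, 4, NULL, NULL, title, strlen(title) + 1);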
*/ if (pid == p->p_pid && p->p_numthreads == 1 && req->newptr == NULL && (pa = p->p_args) != NULL) return (SYSCTL_OUT(req, pa->ar_args, pa->ar_length)); flags = PGET_CANSEE; if (req->newptr != NULL) flags |= PGET_ISCURRENT; error = pget(pid, flags, &p); if (error) return (error); pa = p->p_args; if (pa != NULL) { pargs_hold(pa); PROC_UNLOCK(p); error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) { _PHOLD(p); PROC_UNLOCK(p); sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getargv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); if (error == 0 && error2 != 0) error = error2; } else { PROC_UNLOCK(p); } if (error != 0 || req->newptr == NULL) return (error); if (req->newlen > ps_arg_cache_limit - sizeof(struct pargs)) return (ENOMEM); if (req->newlen == 0) { /* * Clear the argument pointer, so that we'll fetch arguments * with proc_getargv() until further notice. */ newpa = NULL; } else { newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve environment of another process. */ static int sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getenvv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve ELF auxiliary vector of * another process. */ static int sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getauxv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * Look up the canonical executable path running in the specified process. * It tries to return the same hardlink name as was used for execve(2). * This allows the programs that modify their behavior based on their progname, * to operate correctly. * * Result is returned in retbuf, it must not be freed, similar to vn_fullpath() * calling conventions. * binname is a pointer to temporary string buffer of length MAXPATHLEN, * allocated and freed by caller. * freebuf should be freed by caller, from the M_TEMP malloc type. 
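 *
 * A minimal caller sketch (mirroring sysctl_kern_proc_pathname() below):
 * the process is entered locked and comes back unlocked.
 *
 *      binname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 *      binname[0] = '\0';
 *      PROC_LOCK(p);
 *      error = proc_get_binpath(p, binname, &retbuf, &freebuf);
 *      free(binname, M_TEMP);
 *      if (error == 0) {
 *              ... use retbuf ...
 *              free(freebuf, M_TEMP);
 *      }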
*/ int proc_get_binpath(struct proc *p, char *binname, char **retbuf, char **freebuf) { struct nameidata nd; struct vnode *vp, *dvp; size_t freepath_size; int error; bool do_fullpath; PROC_LOCK_ASSERT(p, MA_OWNED); vp = p->p_textvp; if (vp == NULL) { PROC_UNLOCK(p); *retbuf = ""; *freebuf = NULL; return (0); } vref(vp); dvp = p->p_textdvp; if (dvp != NULL) vref(dvp); if (p->p_binname != NULL) strlcpy(binname, p->p_binname, MAXPATHLEN); PROC_UNLOCK(p); do_fullpath = true; *freebuf = NULL; if (dvp != NULL && binname[0] != '\0') { freepath_size = MAXPATHLEN; if (vn_fullpath_hardlink(vp, dvp, binname, strlen(binname), retbuf, freebuf, &freepath_size) == 0) { /* * Recheck the looked up path. The binary * might have been renamed or replaced, in * which case we should not report old name. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, *retbuf); error = namei(&nd); if (error == 0) { if (nd.ni_vp == vp) do_fullpath = false; vrele(nd.ni_vp); NDFREE_PNBUF(&nd); } } } if (do_fullpath) { free(*freebuf, M_TEMP); *freebuf = NULL; error = vn_fullpath(vp, retbuf, freebuf); } vrele(vp); if (dvp != NULL) vrele(dvp); return (error); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; char *retbuf, *freebuf, *binname; int error; if (arglen != 1) return (EINVAL); binname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); binname[0] = '\0'; if (*pidp == -1) { /* -1 means this process */ error = 0; p = req->td->td_proc; PROC_LOCK(p); } else { error = pget(*pidp, PGET_CANSEE, &p); } if (error == 0) error = proc_get_binpath(p, binname, &retbuf, &freebuf); free(binname, M_TEMP); if (error != 0) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error != 0) return (error); sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } #ifdef KINFO_OVMENTRY_SIZE CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE); #endif #ifdef COMPAT_FREEBSD7 static int sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS) { vm_map_entry_t entry, tmp_entry; unsigned int last_timestamp, namelen; char *fullpath, *freepath; struct kinfo_ovmentry *kve; struct vattr va; struct ucred *cred; int error, *name; struct vnode *vp; struct proc *p; vm_map_t map; struct vmspace *vm; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); map = &vm->vm_map; vm_map_lock_read(map); VM_MAP_ENTRY_FOREACH(entry, map) { vm_object_t obj, tobj, lobj; vm_offset_t addr; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_structsize = sizeof(*kve); kve->kve_private_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; } kve->kve_resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) kve->kve_resident++; addr += PAGE_SIZE; } for (lobj = tobj 
= obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) { VM_OBJECT_RLOCK(tobj); kve->kve_offset += tobj->backing_object_offset; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } kve->kve_start = (void*)entry->start; kve->kve_end = (void*)entry->end; kve->kve_offset += (off_t)entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; last_timestamp = map->timestamp; vm_map_unlock_read(map); kve->kve_fileid = 0; kve->kve_fsid = 0; freepath = NULL; fullpath = ""; if (lobj) { kve->kve_type = vm_object_kvme_type(lobj, &vp); if (kve->kve_type == KVME_TYPE_MGTDEVICE) kve->kve_type = KVME_TYPE_UNKNOWN; if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_fileid = va.va_fileid; /* truncate */ kve->kve_fsid = va.va_fsid; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); error = SYSCTL_OUT(req, kve, sizeof(*kve)); vm_map_lock_read(map); if (error) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } #endif /* COMPAT_FREEBSD7 */ #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif void kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry, int *resident_count, bool *super) { vm_object_t obj, tobj; vm_page_t m, m_adv; vm_offset_t addr; vm_paddr_t pa; vm_pindex_t pi, pi_adv, pindex; *super = false; *resident_count = 0; if (vmmap_skip_res_cnt) return; pa = 0; obj = entry->object.vm_object; addr = entry->start; m_adv = NULL; pi = OFF_TO_IDX(entry->offset); for (; addr < entry->end; addr += IDX_TO_OFF(pi_adv), pi += pi_adv) { if (m_adv != NULL) { m = m_adv; } else { pi_adv = atop(entry->end - addr); pindex = pi; for (tobj = obj;; tobj = tobj->backing_object) { m = vm_page_find_least(tobj, pindex); if (m != NULL) { if (m->pindex == pindex) break; if (pi_adv > m->pindex - pindex) { pi_adv = m->pindex - pindex; m_adv = m; } } if (tobj->backing_object == NULL) goto next; pindex += OFF_TO_IDX(tobj-> backing_object_offset); } } m_adv = NULL; if (m->psind != 0 && addr + pagesizes[1] <= entry->end && (addr & (pagesizes[1] - 1)) == 0 && (pmap_mincore(map->pmap, addr, &pa) & MINCORE_SUPER) != 0) { *super = true; pi_adv = atop(pagesizes[1]); } else { /* * We do not test the found page on validity. * Either the page is busy and being paged in, * or it was invalidated. The first case * should be counted as resident, the second * is not so clear; we do account both. */ pi_adv = 1; } *resident_count += pi_adv; next:; } } /* * Must be called with the process locked and will return unlocked. 
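 *
 * A caller sketch following sysctl_kern_proc_vmmap() below: the process is
 * passed in locked, maxlen of -1 means "no limit", and the entries are
 * accumulated into an sbuf.
 *
 *      sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req);
 *      sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 *      error = pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 *      if (error == 0)
 *              error = kern_proc_vmmap_out(p, &sb, -1,
 *                  KERN_VMMAP_PACK_KINFO);     (returns with p unlocked)
 *      error2 = sbuf_finish(&sb);
 *      sbuf_delete(&sb);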
*/ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { vm_map_entry_t entry, tmp_entry; struct vattr va; vm_map_t map; vm_object_t lobj, nobj, obj, tobj; char *fullpath, *freepath; struct kinfo_vmentry *kve; struct ucred *cred; struct vnode *vp; struct vmspace *vm; vm_offset_t addr; unsigned int last_timestamp; int error; bool guard, super; PROC_LOCK_ASSERT(p, MA_OWNED); _PHOLD(p); PROC_UNLOCK(p); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK | M_ZERO); error = 0; map = &vm->vm_map; vm_map_lock_read(map); VM_MAP_ENTRY_FOREACH(entry, map) { if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; addr = entry->end; bzero(kve, sizeof(*kve)); obj = entry->object.vm_object; if (obj != NULL) { if ((obj->flags & OBJ_ANON) != 0) kve->kve_obj = (uintptr_t)obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); kve->kve_offset += tobj->backing_object_offset; lobj = tobj; } if (obj->backing_object == NULL) kve->kve_private_resident = obj->resident_page_count; kern_proc_vmmap_resident(map, entry, &kve->kve_resident, &super); if (super) kve->kve_flags |= KVME_FLAG_SUPER; for (tobj = obj; tobj != NULL; tobj = nobj) { nobj = tobj->backing_object; if (tobj != obj && tobj != lobj) VM_OBJECT_RUNLOCK(tobj); } } else { lobj = NULL; } kve->kve_start = entry->start; kve->kve_end = entry->end; kve->kve_offset += entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; if (entry->eflags & MAP_ENTRY_GROWS_UP) kve->kve_flags |= KVME_FLAG_GROWS_UP; if (entry->eflags & MAP_ENTRY_GROWS_DOWN) kve->kve_flags |= KVME_FLAG_GROWS_DOWN; if (entry->eflags & MAP_ENTRY_USER_WIRED) kve->kve_flags |= KVME_FLAG_USER_WIRED; guard = (entry->eflags & MAP_ENTRY_GUARD) != 0; last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = ""; if (lobj != NULL) { kve->kve_type = vm_object_kvme_type(lobj, &vp); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); kve->kve_vn_type = vntype_to_kinfo(vp->v_type); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_vn_fileid = va.va_fileid; kve->kve_vn_fsid = va.va_fsid; kve->kve_vn_fsid_freebsd11 = kve->kve_vn_fsid; /* truncate */ kve->kve_vn_mode = MAKEIMODE(va.va_type, va.va_mode); kve->kve_vn_size = va.va_size; kve->kve_vn_rdev = va.va_rdev; kve->kve_vn_rdev_freebsd11 = kve->kve_vn_rdev; /* truncate */ kve->kve_status = KF_ATTR_VALID; } vput(vp); } } else { kve->kve_type = guard ? 
KVME_TYPE_GUARD : KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ if ((flags & KERN_VMMAP_PACK_KINFO) != 0) kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) + strlen(kve->kve_path) + 1; else kve->kve_structsize = sizeof(*kve); kve->kve_structsize = roundup(kve->kve_structsize, sizeof(uint64_t)); /* Halt filling and truncate rather than exceeding maxlen */ if (maxlen != -1 && maxlen < kve->kve_structsize) { error = 0; vm_map_lock_read(map); break; } else if (maxlen != -1) maxlen -= kve->kve_structsize; if (sbuf_bcat(sb, kve, kve->kve_structsize) != 0) error = ENOMEM; vm_map_lock_read(map); if (error != 0) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } static int sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS) { struct proc *p; struct sbuf sb; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } error = kern_proc_vmmap_out(p, &sb, -1, KERN_VMMAP_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #if defined(STACK) || defined(DDB) static int sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS) { struct kinfo_kstack *kkstp; int error, i, *name, numthreads; lwpid_t *lwpidarray; struct thread *td; struct stack *st; struct sbuf sb; struct proc *p; u_int namelen; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p); if (error != 0) return (error); kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK); st = stack_create(M_WAITOK); lwpidarray = NULL; PROC_LOCK(p); do { if (lwpidarray != NULL) { free(lwpidarray, M_TEMP); lwpidarray = NULL; } numthreads = p->p_numthreads; PROC_UNLOCK(p); lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP, M_WAITOK | M_ZERO); PROC_LOCK(p); } while (numthreads < p->p_numthreads); /* * XXXRW: During the below loop, execve(2) and countless other sorts * of changes could have taken place. Should we check to see if the * vmspace has been replaced, or the like, in order to prevent * giving a snapshot that spans, say, execve(2), with some threads * before and some after? Among other things, the credentials could * have changed, in which case the right to extract debug info might * no longer be assured. 
*/ i = 0; FOREACH_THREAD_IN_PROC(p, td) { KASSERT(i < numthreads, ("sysctl_kern_proc_kstack: numthreads")); lwpidarray[i] = td->td_tid; i++; } PROC_UNLOCK(p); numthreads = i; for (i = 0; i < numthreads; i++) { td = tdfind(lwpidarray[i], p->p_pid); if (td == NULL) { continue; } bzero(kkstp, sizeof(*kkstp)); (void)sbuf_new(&sb, kkstp->kkst_trace, sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN); thread_lock(td); kkstp->kkst_tid = td->td_tid; if (TD_IS_SWAPPED(td)) kkstp->kkst_state = KKST_STATE_SWAPPED; else if (stack_save_td(st, td) == 0) kkstp->kkst_state = KKST_STATE_STACKOK; else kkstp->kkst_state = KKST_STATE_RUNNING; thread_unlock(td); PROC_UNLOCK(p); stack_sbuf_print(&sb, st); sbuf_finish(&sb); sbuf_delete(&sb); error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp)); if (error) break; } PRELE(p); if (lwpidarray != NULL) free(lwpidarray, M_TEMP); stack_destroy(st); free(kkstp, M_TEMP); return (error); } #endif /* * This sysctl allows a process to retrieve the full list of groups from * itself or another process. */ static int sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct ucred *cred; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; PROC_LOCK(p); } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } cred = crhold(p->p_ucred); PROC_UNLOCK(p); error = SYSCTL_OUT(req, cred->cr_groups, cred->cr_ngroups * sizeof(gid_t)); crfree(cred); return (error); } /* * This sysctl allows a process to retrieve or/and set the resource limit for * another process. */ static int sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rlimit rlim; struct proc *p; u_int which; int flags, error; if (namelen != 2) return (EINVAL); which = (u_int)name[1]; if (which >= RLIM_NLIMITS) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(rlim)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); /* * Retrieve limit. */ if (req->oldptr != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); } error = SYSCTL_OUT(req, &rlim, sizeof(rlim)); if (error != 0) goto errout; /* * Set limit. */ if (req->newptr != NULL) { error = SYSCTL_IN(req, &rlim, sizeof(rlim)); if (error == 0) error = kern_proc_setrlimit(curthread, p, which, &rlim); } errout: PRELE(p); return (error); } /* * This sysctl allows a process to retrieve ps_strings structure location of * another process. */ static int sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; vm_offset_t ps_strings; int error; #ifdef COMPAT_FREEBSD32 uint32_t ps_strings32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { /* * We return 0 if the 32 bit emulation request is for a 64 bit * process. */ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ? PTROUT(PROC_PS_STRINGS(p)) : 0; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32)); return (error); } #endif ps_strings = PROC_PS_STRINGS(p); PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings)); return (error); } /* * This sysctl allows a process to retrieve umask of another process. 
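 *
 * A userland sketch (the KERN_PROC_UMASK MIB name is assumed here, not
 * taken from this file): this is a plain read-only query.
 *
 *      int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_UMASK, pid };
 *      u_short cmask;
 *      size_t len = sizeof(cmask);
 *
 *      if (sysctl(mib, 4, &cmask, &len, NULL, 0) == 0)
 *              printf("umask %03o\n", (unsigned)cmask);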
*/ static int sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int error; u_short cmask; pid_t pid; if (namelen != 1) return (EINVAL); pid = (pid_t)name[0]; p = curproc; if (pid == p->p_pid || pid == 0) { cmask = p->p_pd->pd_cmask; goto out; } error = pget(pid, PGET_WANTREAD, &p); if (error != 0) return (error); cmask = p->p_pd->pd_cmask; PRELE(p); out: error = SYSCTL_OUT(req, &cmask, sizeof(cmask)); return (error); } /* * This sysctl allows a process to set and retrieve binary osreldate of * another process. */ static int sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, error, osrel; if (namelen != 1) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(osrel)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel)); if (error != 0) goto errout; if (req->newptr != NULL) { error = SYSCTL_IN(req, &osrel, sizeof(osrel)); if (error != 0) goto errout; if (osrel < 0) { error = EINVAL; goto errout; } p->p_osrel = osrel; } errout: PRELE(p); return (error); } static int sysctl_kern_proc_sigtramp(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct kinfo_sigtramp kst; const struct sysentvec *sv; int error; #ifdef COMPAT_FREEBSD32 struct kinfo_sigtramp32 kst32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); sv = p->p_sysent; #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { bzero(&kst32, sizeof(kst32)); if (SV_PROC_FLAG(p, SV_ILP32)) { if (PROC_HAS_SHP(p)) { kst32.ksigtramp_start = PROC_SIGCODE(p); kst32.ksigtramp_end = kst32.ksigtramp_start + ((sv->sv_flags & SV_DSO_SIG) == 0 ? *sv->sv_szsigcode : (uintptr_t)sv->sv_szsigcode); } else { kst32.ksigtramp_start = PROC_PS_STRINGS(p) - *sv->sv_szsigcode; kst32.ksigtramp_end = PROC_PS_STRINGS(p); } } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst32, sizeof(kst32)); return (error); } #endif bzero(&kst, sizeof(kst)); if (PROC_HAS_SHP(p)) { kst.ksigtramp_start = (char *)PROC_SIGCODE(p); kst.ksigtramp_end = (char *)kst.ksigtramp_start + ((sv->sv_flags & SV_DSO_SIG) == 0 ? *sv->sv_szsigcode : (uintptr_t)sv->sv_szsigcode); } else { kst.ksigtramp_start = (char *)PROC_PS_STRINGS(p) - *sv->sv_szsigcode; kst.ksigtramp_end = (char *)PROC_PS_STRINGS(p); } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst, sizeof(kst)); return (error); } static int sysctl_kern_proc_sigfastblk(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; pid_t pid; struct proc *p; struct thread *td1; uintptr_t addr; #ifdef COMPAT_FREEBSD32 uint32_t addr32; #endif int error; if (namelen != 1 || req->newptr != NULL) return (EINVAL); pid = (pid_t)name[0]; error = pget(pid, PGET_HOLD | PGET_NOTWEXIT | PGET_CANDEBUG, &p); if (error != 0) return (error); PROC_LOCK(p); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { if (!SV_PROC_FLAG(p, SV_ILP32)) { error = EINVAL; goto errlocked; } } #endif if (pid <= PID_MAX) { td1 = FIRST_THREAD_IN_PROC(p); } else { FOREACH_THREAD_IN_PROC(p, td1) { if (td1->td_tid == pid) break; } } if (td1 == NULL) { error = ESRCH; goto errlocked; } /* * The access to the private thread flags. 
It is fine as far * as no out-of-thin-air values are read from td_pflags, and * usermode read of the td_sigblock_ptr is racy inherently, * since target process might have already changed it * meantime. */ if ((td1->td_pflags & TDP_SIGFASTBLOCK) != 0) addr = (uintptr_t)td1->td_sigblock_ptr; else error = ENOTTY; errlocked: _PRELE(p); PROC_UNLOCK(p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { addr32 = addr; error = SYSCTL_OUT(req, &addr32, sizeof(addr32)); } else #endif error = SYSCTL_OUT(req, &addr, sizeof(addr)); return (error); } static int sysctl_kern_proc_vm_layout(SYSCTL_HANDLER_ARGS) { struct kinfo_vm_layout kvm; struct proc *p; struct vmspace *vmspace; int error, *name; name = (int *)arg1; if ((u_int)arg2 != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { if (!SV_PROC_FLAG(p, SV_ILP32)) { PROC_UNLOCK(p); return (EINVAL); } } #endif vmspace = vmspace_acquire_ref(p); PROC_UNLOCK(p); memset(&kvm, 0, sizeof(kvm)); kvm.kvm_min_user_addr = vm_map_min(&vmspace->vm_map); kvm.kvm_max_user_addr = vm_map_max(&vmspace->vm_map); kvm.kvm_text_addr = (uintptr_t)vmspace->vm_taddr; kvm.kvm_text_size = vmspace->vm_tsize; kvm.kvm_data_addr = (uintptr_t)vmspace->vm_daddr; kvm.kvm_data_size = vmspace->vm_dsize; kvm.kvm_stack_addr = (uintptr_t)vmspace->vm_maxsaddr; kvm.kvm_stack_size = vmspace->vm_ssize; kvm.kvm_shp_addr = vmspace->vm_shp_base; kvm.kvm_shp_size = p->p_sysent->sv_shared_page_len; if ((vmspace->vm_map.flags & MAP_WIREFUTURE) != 0) kvm.kvm_map_flags |= KMAP_FLAG_WIREFUTURE; if ((vmspace->vm_map.flags & MAP_ASLR) != 0) kvm.kvm_map_flags |= KMAP_FLAG_ASLR; if ((vmspace->vm_map.flags & MAP_ASLR_IGNSTART) != 0) kvm.kvm_map_flags |= KMAP_FLAG_ASLR_IGNSTART; if ((vmspace->vm_map.flags & MAP_WXORX) != 0) kvm.kvm_map_flags |= KMAP_FLAG_WXORX; if ((vmspace->vm_map.flags & MAP_ASLR_STACK) != 0) kvm.kvm_map_flags |= KMAP_FLAG_ASLR_STACK; if (vmspace->vm_shp_base != p->p_sysent->sv_shared_page_base && PROC_HAS_SHP(p)) kvm.kvm_map_flags |= KMAP_FLAG_ASLR_SHARED_PAGE; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct kinfo_vm_layout32 kvm32; memset(&kvm32, 0, sizeof(kvm32)); kvm32.kvm_min_user_addr = (uint32_t)kvm.kvm_min_user_addr; kvm32.kvm_max_user_addr = (uint32_t)kvm.kvm_max_user_addr; kvm32.kvm_text_addr = (uint32_t)kvm.kvm_text_addr; kvm32.kvm_text_size = (uint32_t)kvm.kvm_text_size; kvm32.kvm_data_addr = (uint32_t)kvm.kvm_data_addr; kvm32.kvm_data_size = (uint32_t)kvm.kvm_data_size; kvm32.kvm_stack_addr = (uint32_t)kvm.kvm_stack_addr; kvm32.kvm_stack_size = (uint32_t)kvm.kvm_stack_size; kvm32.kvm_shp_addr = (uint32_t)kvm.kvm_shp_addr; kvm32.kvm_shp_size = (uint32_t)kvm.kvm_shp_size; kvm32.kvm_map_flags = kvm.kvm_map_flags; vmspace_free(vmspace); error = SYSCTL_OUT(req, &kvm32, sizeof(kvm32)); goto out; } #endif error = SYSCTL_OUT(req, &kvm, sizeof(kvm)); #ifdef COMPAT_FREEBSD32 out: #endif vmspace_free(vmspace); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT| CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, 
KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_CAPWR | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_env, "Process environment"); static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, including threads"); #ifdef COMPAT_FREEBSD7 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries"); #if defined(STACK) || defined(DDB) static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, 
CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit, "Process resource limits"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings, "Process ps_strings location"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask"); static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel, "Process binary osreldate"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGTRAMP, sigtramp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sigtramp, "Process signal trampoline location"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGFASTBLK, sigfastblk, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_sigfastblk, "Thread sigfastblock address"); static SYSCTL_NODE(_kern_proc, KERN_PROC_VM_LAYOUT, vm_layout, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_vm_layout, "Process virtual address space layout info"); static struct sx stop_all_proc_blocker; SX_SYSINIT(stop_all_proc_blocker, &stop_all_proc_blocker, "sapblk"); bool stop_all_proc_block(void) { return (sx_xlock_sig(&stop_all_proc_blocker) == 0); } void stop_all_proc_unblock(void) { sx_xunlock(&stop_all_proc_blocker); } int allproc_gen; /* * stop_all_proc() purpose is to stop all process which have usermode, * except current process for obvious reasons. This makes it somewhat * unreliable when invoked from multithreaded process. The service * must not be user-callable anyway. */ void stop_all_proc(void) { struct proc *cp, *p; int r, gen; bool restart, seen_stopped, seen_exiting, stopped_some; if (!stop_all_proc_block()) return; cp = curproc; allproc_loop: sx_xlock(&allproc_lock); gen = allproc_gen; seen_exiting = seen_stopped = stopped_some = restart = false; LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) { PROC_UNLOCK(p); continue; } if ((p->p_flag2 & P2_WEXIT) != 0) { seen_exiting = true; PROC_UNLOCK(p); continue; } if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { /* * Stopped processes are tolerated when there * are no other processes which might continue * them. P_STOPPED_SINGLE but not * P_TOTAL_STOP process still has at least one * thread running. */ seen_stopped = true; PROC_UNLOCK(p); continue; } sx_xunlock(&allproc_lock); _PHOLD(p); r = thread_single(p, SINGLE_ALLPROC); if (r != 0) restart = true; else stopped_some = true; _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } /* Catch forked children we did not see in iteration. */ if (gen != allproc_gen) restart = true; sx_xunlock(&allproc_lock); if (restart || stopped_some || seen_exiting || seen_stopped) { kern_yield(PRI_USER); goto allproc_loop; } } void resume_all_proc(void) { struct proc *cp, *p; cp = curproc; sx_xlock(&allproc_lock); again: LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & P_TOTAL_STOP) != 0) { sx_xunlock(&allproc_lock); _PHOLD(p); thread_single_end(p, SINGLE_ALLPROC); _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } else { PROC_UNLOCK(p); } } /* Did the loop above missed any stopped process ? */ FOREACH_PROC_IN_SYSTEM(p) { /* No need for proc lock. 
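 * The unlocked read of p_flag is only used to decide whether another
 * pass over the list is needed.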
*/ if ((p->p_flag & P_TOTAL_STOP) != 0) goto again; } sx_xunlock(&allproc_lock); stop_all_proc_unblock(); } /* #define TOTAL_STOP_DEBUG 1 */ #ifdef TOTAL_STOP_DEBUG volatile static int ap_resume; #include static int sysctl_debug_stop_all_proc(SYSCTL_HANDLER_ARGS) { int error, val; val = 0; ap_resume = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0) { stop_all_proc(); syncer_suspend(); while (ap_resume == 0) ; syncer_resume(); resume_all_proc(); } return (0); } SYSCTL_PROC(_debug, OID_AUTO, stop_all_proc, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, __DEVOLATILE(int *, &ap_resume), 0, sysctl_debug_stop_all_proc, "I", ""); #endif diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index 8c2ec0e42d5d..87afc175a72d 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -1,1838 +1,1838 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_printf.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); static bool debugger_on_recursive_panic = false; SYSCTL_BOOL(_debug, OID_AUTO, debugger_on_recursive_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_recursive_panic, 0, "Run debugger on recursive kernel panic"); int debugger_on_trap = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_trap, 0, "Run debugger on kernel trap before panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; static bool trace_all_panics = true; #else static int trace_on_panic = 0; static bool trace_all_panics = false; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN, &trace_all_panics, 0, "Print stack traces on secondary kernel panics"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static bool poweroff_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN, &poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic"); static bool powercycle_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN, &powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, "Show busy buffers during shutdown"); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; union { struct { keyInstance aes_ki; cipherInstance aes_ci; } u_aes; struct chacha_ctx u_chacha; } u; #define kdc_ki 
u.u_aes.aes_ki #define kdc_ci u.u_aes.aes_ci #define kdc_chacha u.u_chacha uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif struct kerneldumpcomp { uint8_t kdc_format; struct compressor *kdc_stream; uint8_t *kdc_buf; size_t kdc_resid; }; static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression); static void kerneldumpcomp_destroy(struct dumperinfo *di); static int kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg); static int kerneldump_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN, &kerneldump_gzlevel, 0, "Kernel crash dump compression level"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; bool __read_frequently panicked; int __read_mostly dumping; /* system is dumping */ int rebooting; /* system is rebooting */ /* * Used to serialize between sysctl kern.shutdown.dumpdevname and list * modifications via ioctl. */ static struct mtx dumpconf_list_lk; MTX_SYSINIT(dumper_configs, &dumpconf_list_lk, "dumper config list", MTX_DEF); /* Our selected dumper(s). */ static TAILQ_HEAD(dumpconflist, dumperinfo) dumper_configs = TAILQ_HEAD_INITIALIZER(dumper_configs); /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); else kern_reboot(uap->opt); } return (error); } static void shutdown_nice_task_fn(void *arg, int pending __unused) { int howto; howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. 
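 * RB_POWEROFF is signalled as SIGUSR2, RB_POWERCYCLE as SIGWINCH,
 * RB_HALT as SIGUSR1, and a plain reboot as SIGINT; init(8) turns the
 * signal into the corresponding clean shutdown.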
*/ PROC_LOCK(initproc); if ((howto & RB_POWEROFF) != 0) { BOOTTRACE("SIGUSR2 to init(8)"); kern_psignal(initproc, SIGUSR2); } else if ((howto & RB_POWERCYCLE) != 0) { BOOTTRACE("SIGWINCH to init(8)"); kern_psignal(initproc, SIGWINCH); } else if ((howto & RB_HALT) != 0) { BOOTTRACE("SIGUSR1 to init(8)"); kern_psignal(initproc, SIGUSR1); } else { BOOTTRACE("SIGINT to init(8)"); kern_psignal(initproc, SIGINT); } PROC_UNLOCK(initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, &shutdown_nice_task_fn, NULL); /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL && !SCHEDULER_STOPPED()) { BOOTTRACE("shutdown initiated"); shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } /* * Set up a context that can be extracted from the dump. */ void dump_savectx(void) { savectx(&dumppcb); dumptid = curthread->td_tid; } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (TAILQ_EMPTY(&dumper_configs)) return (ENXIO); dump_savectx(); dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(TAILQ_FIRST(&dumper_configs)); } #endif if (coredump) { struct dumperinfo *di; TAILQ_FOREACH(di, &dumper_configs, di_next) { error = dumpsys(di); if (error == 0) break; } } dumping--; return (error); } /* * Trace the shutdown reason. */ static void reboottrace(int howto) { if ((howto & RB_DUMP) != 0) { if ((howto & RB_HALT) != 0) BOOTTRACE("system panic: halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system panic: powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system panic: rebooting..."); } else { if ((howto & RB_HALT) != 0) BOOTTRACE("system halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system rebooting..."); } } /* * kern_reboot(9): Shut down the system cleanly to prepare for reboot, halt, or * power off. */ void kern_reboot(int howto) { static int once = 0; if (initproc != NULL && curproc != initproc) BOOTTRACE("kernel shutdown (dirty) started"); else BOOTTRACE("kernel shutdown (clean) started"); /* * Normal paths here don't hold Giant, but we can wind up here * unexpectedly with it held. Drop it now so we don't have to * drop and pick it up elsewhere. The paths it is locking will * never be returned to, and it is preferable to preclude * deadlock than to lock against code that won't ever * continue. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); #if defined(SMP) /* * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. 
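 * The binding is only attempted while the scheduler is still running;
 * in the panic path SCHEDULER_STOPPED() is true and the check below
 * skips it.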
*/ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("%s: not running on cpu 0", __func__)); } #endif /* We're in the process of rebooting. */ rebooting = 1; reboottrace(howto); /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); BOOTTRACE("shutdown pre sync complete"); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; BOOTTRACE("bufshutdown begin"); bufshutdown(show_busybufs); BOOTTRACE("bufshutdown end"); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); BOOTTRACE("shutdown post sync complete"); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ BOOTTRACE("shutdown final begin"); if (shutdown_trace) boottrace_dump_console(); EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp); return (ENOENT); } if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); wdog_kern_pat(WD_TO_NEVER); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: break; } } } /* * Check to see if the system panicked, pause and then reboot * according to the specified delay. 
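 * A panic_reboot_wait_time of -1 skips the countdown and waits
 * indefinitely for a key press, while 0 reboots immediately.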
*/ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "kassert options"); #ifdef KASSERT_PANIC_OPTIONAL #define KASSERT_RWTUN CTLFLAG_RWTUN #else #define KASSERT_RWTUN CTLFLAG_RDTUN #endif SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (0) or just a warning (1)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN, &kassert_do_log, 0, "If warn_only is enabled, log (1) or do not log (0) assertion violations"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN, &kassert_suppress_in_panic, 0, "KASSERTs will be suppressed while handling a panic"); #undef KASSERT_RWTUN static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int 
kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } #ifdef KASSERT_PANIC_OPTIONAL /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * If we are suppressing secondary panics, log the warning but do not * re-enter panic/kdb. */ if (KERNEL_PANICKED() && kassert_suppress_in_panic) { if (kassert_do_log) { printf("KASSERT failed: %s\n", buf); #ifdef KDB if (trace_all_panics && trace_on_panic) kdb_backtrace(); #endif } return; } /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* KASSERT_PANIC_OPTIONAL */ #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } #endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; bootopt = RB_AUTOBOOT; newpanic = 0; if (KERNEL_PANICKED()) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; panicked = true; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif printf("time = %jd\n", (intmax_t )time_second); #ifdef KDB if ((newpanic || trace_all_panics) && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); else if (!newpanic && debugger_on_recursive_panic) kdb_enter(KDB_WHY_PANIC, "re-panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; if (poweroff_on_panic) bootopt |= RB_POWEROFF; if (powercycle_on_panic) bootopt |= RB_POWERCYCLE; kern_reboot(bootopt); } /* * Support for poweroff delay. 
* * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (KERNEL_PANICKED()) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (KERNEL_PANICKED()) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static int dumpdevname_sysctl_handler(SYSCTL_HANDLER_ARGS) { char buf[256]; struct dumperinfo *di; struct sbuf sb; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH(di, &dumper_configs, di_next) { if (di != TAILQ_FIRST(&dumper_configs)) sbuf_putc(&sb, ','); sbuf_cat(&sb, di->di_devname); } mtx_unlock(&dumpconf_list_lk); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_shutdown, OID_AUTO, dumpdevname, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, &dumper_configs, 0, dumpdevname_sysctl_handler, "A", "Device(s) for kernel dumps"); static int _dump_append(struct dumperinfo *di, void *virtual, size_t length); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; case KERNELDUMP_ENC_CHACHA20: chacha_keysetup(&kdc->kdc_chacha, key, 256); break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = 
htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: zfree(kdc, M_EKCD); return (NULL); } static int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. */ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; case KERNELDUMP_ENC_CHACHA20: chacha_ivsetup(&kdc->kdc_chacha, kdc->kdc_iv, NULL); break; default: error = EINVAL; goto out; } kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); } static uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); } #endif /* EKCD */ static struct kerneldumpcomp * kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression) { struct kerneldumpcomp *kdcomp; int format; switch (compression) { case KERNELDUMP_COMP_GZIP: format = COMPRESS_GZIP; break; case KERNELDUMP_COMP_ZSTD: format = COMPRESS_ZSTD; break; default: return (NULL); } kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO); kdcomp->kdc_format = compression; kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb, format, di->maxiosize, kerneldump_gzlevel, di); if (kdcomp->kdc_stream == NULL) { free(kdcomp, M_DUMPER); return (NULL); } kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP); return (kdcomp); } static void kerneldumpcomp_destroy(struct dumperinfo *di) { struct kerneldumpcomp *kdcomp; kdcomp = di->kdcomp; if (kdcomp == NULL) return; compressor_fini(kdcomp->kdc_stream); zfree(kdcomp->kdc_buf, M_DUMPER); free(kdcomp, M_DUMPER); } /* * Free a dumper. Must not be present on global list. */ void dumper_destroy(struct dumperinfo *di) { if (di == NULL) return; zfree(di->blockbuf, M_DUMPER); kerneldumpcomp_destroy(di); #ifdef EKCD zfree(di->kdcrypto, M_EKCD); #endif zfree(di, M_DUMPER); } /* * Allocate and set up a new dumper from the provided template. */ int dumper_create(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda, struct dumperinfo **dip) { struct dumperinfo *newdi; int error = 0; if (dip == NULL) return (EINVAL); /* Allocate a new dumper */ newdi = malloc(sizeof(*newdi) + strlen(devname) + 1, M_DUMPER, M_WAITOK | M_ZERO); memcpy(newdi, di_template, sizeof(*newdi)); newdi->blockbuf = NULL; newdi->kdcrypto = NULL; newdi->kdcomp = NULL; strcpy(newdi->di_devname, devname); if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD newdi->kdcrypto = kerneldumpcrypto_create(newdi->blocksize, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, kda->kda_encryptedkey); if (newdi->kdcrypto == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } if (kda->kda_compression != KERNELDUMP_COMP_NONE) { #ifdef EKCD /* * We can't support simultaneous unpadded block cipher * encryption and compression because there is no guarantee the * length of the compressed result is exactly a multiple of the * cipher block size. 
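 * (ChaCha20 is a stream cipher and needs no such padding, so compression
 * combined with KERNELDUMP_ENC_CHACHA20 remains permitted.)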
*/ if (kda->kda_encryption == KERNELDUMP_ENC_AES_256_CBC) { error = EOPNOTSUPP; goto cleanup; } #endif newdi->kdcomp = kerneldumpcomp_create(newdi, kda->kda_compression); if (newdi->kdcomp == NULL) { error = EINVAL; goto cleanup; } } newdi->blockbuf = malloc(newdi->blocksize, M_DUMPER, M_WAITOK | M_ZERO); *dip = newdi; return (0); cleanup: dumper_destroy(newdi); return (error); } /* * Create a new dumper and register it in the global list. */ int dumper_insert(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *newdi, *listdi; bool inserted; uint8_t index; int error; index = kda->kda_index; MPASS(index != KDA_REMOVE && index != KDA_REMOVE_DEV && index != KDA_REMOVE_ALL); error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); error = dumper_create(di_template, devname, kda, &newdi); if (error != 0) return (error); /* Add the new configuration to the queue */ mtx_lock(&dumpconf_list_lk); inserted = false; TAILQ_FOREACH(listdi, &dumper_configs, di_next) { if (index == 0) { TAILQ_INSERT_BEFORE(listdi, newdi, di_next); inserted = true; break; } index--; } if (!inserted) TAILQ_INSERT_TAIL(&dumper_configs, newdi, di_next); mtx_unlock(&dumpconf_list_lk); return (0); } #ifdef DDB void dumper_ddb_insert(struct dumperinfo *newdi) { TAILQ_INSERT_HEAD(&dumper_configs, newdi, di_next); } void dumper_ddb_remove(struct dumperinfo *di) { TAILQ_REMOVE(&dumper_configs, di, di_next); } #endif static bool dumper_config_match(const struct dumperinfo *di, const char *devname, const struct diocskerneldump_arg *kda) { if (kda->kda_index == KDA_REMOVE_ALL) return (true); if (strcmp(di->di_devname, devname) != 0) return (false); /* * Allow wildcard removal of configs matching a device on g_dev_orphan. */ if (kda->kda_index == KDA_REMOVE_DEV) return (true); if (di->kdcomp != NULL) { if (di->kdcomp->kdc_format != kda->kda_compression) return (false); } else if (kda->kda_compression != KERNELDUMP_COMP_NONE) return (false); #ifdef EKCD if (di->kdcrypto != NULL) { if (di->kdcrypto->kdc_encryption != kda->kda_encryption) return (false); /* * Do we care to verify keys match to delete? It seems weird * to expect multiple fallback dump configurations on the same * device that only differ in crypto key. */ } else #endif if (kda->kda_encryption != KERNELDUMP_ENC_NONE) return (false); return (true); } /* * Remove and free the requested dumper(s) from the global list. */ int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *di, *sdi; bool found; int error; error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); /* * Try to find a matching configuration, and kill it. * * NULL 'kda' indicates remove any configuration matching 'devname', * which may remove multiple configurations in atypical configurations. */ found = false; mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH_SAFE(di, &dumper_configs, di_next, sdi) { if (dumper_config_match(di, devname, kda)) { found = true; TAILQ_REMOVE(&dumper_configs, di, di_next); dumper_destroy(di); } } mtx_unlock(&dumpconf_list_lk); /* Only produce ENOENT if a more targeted match didn't match. 
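 * Wildcard removals (KDA_REMOVE_DEV and KDA_REMOVE_ALL) report success
 * even when no configuration matched.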
*/ if (!found && kda->kda_index == KDA_REMOVE) return (ENOENT); return (0); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { if (di->kdcomp != NULL && offset >= di->mediaoffset) { printf( "Compressed dump failed to fit in device boundaries.\n"); return (E2BIG); } printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } if (length % di->blocksize != 0) { printf("Attempt to write partial block of length %ju.\n", (uintmax_t)length); return (EINVAL); } if (offset % di->blocksize != 0) { printf("Attempt to write at unaligned offset %jd.\n", (intmax_t)offset); return (EINVAL); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; case KERNELDUMP_ENC_CHACHA20: chacha_encrypt_bytes(&kdc->kdc_chacha, buf, buf, size); break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int dump_encrypted_write(struct dumperinfo *di, void *virtual, off_t offset, size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; kdc = di->kdcrypto; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); error = dump_write(di, buf, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } return (0); } #endif /* EKCD */ static int kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg) { struct dumperinfo *di; size_t resid, rlength; int error; di = arg; if (length % di->blocksize != 0) { /* * This must be the final write after flushing the compression * stream. Write as many full blocks as possible and stash the * residual data in the dumper's block buffer. It will be * padded and written in dump_finish(). */ rlength = rounddown(length, di->blocksize); if (rlength != 0) { error = _dump_append(di, base, rlength); if (error != 0) return (error); } resid = length - rlength; memmove(di->blockbuf, (uint8_t *)base + rlength, resid); bzero((uint8_t *)di->blockbuf + resid, di->blocksize - resid); di->kdcomp->kdc_resid = resid; return (EAGAIN); } return (_dump_append(di, base, length)); } /* * Write kernel dump headers at the beginning and end of the dump extent. * Write the kernel dump encryption key after the leading header if we were * configured to do so. */ static int dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *buf; size_t hdrsz; uint64_t extent; uint32_t keysize; int error; hdrsz = sizeof(*kdh); if (hdrsz > di->blocksize) return (ENOMEM); #ifdef EKCD kdc = di->kdcrypto; keysize = kerneldumpcrypto_dumpkeysize(kdc); #else keysize = 0; #endif /* * If the dump device has special handling for headers, let it take care * of writing them out. 
*/ if (di->dumper_hdr != NULL) return (di->dumper_hdr(di, kdh)); if (hdrsz == di->blocksize) buf = kdh; else { buf = di->blockbuf; memset(buf, 0, di->blocksize); memcpy(buf, kdh, hdrsz); } extent = dtoh64(kdh->dumpextent); #ifdef EKCD if (kdc != NULL) { error = dump_write(di, kdc->kdc_dumpkey, di->mediaoffset + di->mediasize - di->blocksize - extent - keysize, keysize); if (error != 0) return (error); } #endif error = dump_write(di, buf, di->mediaoffset + di->mediasize - 2 * di->blocksize - extent - keysize, di->blocksize); if (error == 0) error = dump_write(di, buf, di->mediaoffset + di->mediasize - di->blocksize, di->blocksize); return (error); } /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is to * protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64 * 1024) /* * Do some preliminary setup for a kernel dump: initialize state for encryption, * if requested, and make sure that we have enough space on the dump device. * * We set things up so that the dump ends before the last sector of the dump * device, at which the trailing header is written. * * +-----------+------+-----+----------------------------+------+ * | | lhdr | key | ... kernel dump ... | thdr | * +-----------+------+-----+----------------------------+------+ * 1 blk opt <------- dump extent --------> 1 blk * * Dumps written using dump_append() start at the beginning of the extent. * Uncompressed dumps will use the entire extent, but compressed dumps typically * will not. The true length of the dump is recorded in the leading and trailing * headers once the dump has been completed. * * The dump device may provide a callback, in which case it will initialize * dumpoff and take care of laying out the headers. */ int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *key; uint64_t dumpextent, span; uint32_t keysize; int error; #ifdef EKCD /* Send the key before the dump so a partial dump is still usable. */ kdc = di->kdcrypto; error = kerneldumpcrypto_init(kdc); if (error != 0) return (error); keysize = kerneldumpcrypto_dumpkeysize(kdc); key = keysize > 0 ? kdc->kdc_dumpkey : NULL; #else error = 0; keysize = 0; key = NULL; #endif if (di->dumper_start != NULL) { error = di->dumper_start(di, key, keysize); } else { dumpextent = dtoh64(kdh->dumpextent); span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize + keysize; if (di->mediasize < span) { if (di->kdcomp == NULL) return (E2BIG); /* * We don't yet know how much space the compressed dump * will occupy, so try to use the whole swap partition * (minus the first 64KB) in the hope that the * compressed dump will fit. If that doesn't turn out to * be enough, the bounds checking in dump_write() * will catch us and cause the dump to fail. */ dumpextent = di->mediasize - span + dumpextent; kdh->dumpextent = htod64(dumpextent); } /* * The offset at which to begin writing the dump. */ di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize - dumpextent; } di->origdumpoff = di->dumpoff; return (error); } static int _dump_append(struct dumperinfo *di, void *virtual, size_t length) { int error; #ifdef EKCD if (di->kdcrypto != NULL) error = dump_encrypted_write(di, virtual, di->dumpoff, length); else #endif error = dump_write(di, virtual, di->dumpoff, length); if (error == 0) di->dumpoff += length; return (error); } /* * Write to the dump device starting at dumpoff. 
When compression is enabled, * writes to the device will be performed using a callback that gets invoked * when the compression stream's output buffer is full. */ int dump_append(struct dumperinfo *di, void *virtual, size_t length) { void *buf; if (di->kdcomp != NULL) { /* Bounce through a buffer to avoid CRC errors. */ if (length > di->maxiosize) return (EINVAL); buf = di->kdcomp->kdc_buf; memmove(buf, virtual, length); return (compressor_write(di->kdcomp->kdc_stream, buf, length)); } return (_dump_append(di, virtual, length)); } /* * Write to the dump device at the specified offset. */ int dump_write(struct dumperinfo *di, void *virtual, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, offset, length)); } /* * Perform kernel dump finalization: flush the compression stream, if necessary, * write the leading and trailing kernel dump headers now that we know the true * length of the dump, and optionally write the encryption key following the * leading header. */ int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh) { int error; if (di->kdcomp != NULL) { error = compressor_flush(di->kdcomp->kdc_stream); if (error == EAGAIN) { /* We have residual data in di->blockbuf. */ error = _dump_append(di, di->blockbuf, di->blocksize); if (error == 0) /* Compensate for _dump_append()'s adjustment. */ di->dumpoff -= di->blocksize - di->kdcomp->kdc_resid; di->kdcomp->kdc_resid = 0; } if (error != 0) return (error); /* * We now know the size of the compressed dump, so update the * header accordingly and recompute parity. */ kdh->dumplength = htod64(di->dumpoff - di->origdumpoff); kdh->parity = 0; kdh->parity = kerneldump_parity(kdh); compressor_reset(di->kdcomp->kdc_stream); } error = dump_write_headers(di, kdh); if (error != 0) return (error); (void)dump_write(di, NULL, 0, 0); return (0); } void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, const char *magic, uint32_t archver, uint64_t dumplen) { size_t dstsize; bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumpextent = kdh->dumplength; kdh->dumptime = htod64(time_second); #ifdef EKCD kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto)); #else kdh->dumpkeysize = 0; #endif kdh->blocksize = htod32(di->blocksize); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); dstsize = sizeof(kdh->versionstring); if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize) kdh->versionstring[dstsize - 2] = '\n'; if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); if (di->kdcomp != NULL) kdh->compression = di->kdcomp->kdc_format; kdh->parity = kerneldump_parity(kdh); } #ifdef DDB -DB_SHOW_COMMAND(panic, db_show_panic) +DB_SHOW_COMMAND_FLAGS(panic, db_show_panic, DB_CMD_MEMSAFE) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index 5c6c49ec93bd..0ac0eca36da4 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -1,1554 +1,1554 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. 
* (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #include "opt_ddb.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif #ifdef SMP #include #endif DPCPU_DECLARE(sbintime_t, hardclocktime); SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); static void softclock_thread(void *arg); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. 
Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default)"); /* * TODO: * allocate more timeout table slots when table overflows. */ static u_int __read_mostly callwheelsize; static u_int __read_mostly callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; callout_func_t *cc_drain; void *cc_last_func; void *cc_last_arg; #ifdef SMP callout_func_t *ce_migration_func; void *ce_migration_arg; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; sbintime_t cc_firstevent; sbintime_t cc_lastscan; struct thread *cc_thread; u_int cc_bucket; #ifdef KTR char cc_ktr_event_name[20]; #endif }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func #define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec static struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else static struct callout_cpu cc_cpu; #define CC_CPU(cpu) (&cc_cpu) #define CC_SELF() (&cc_cpu) #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int __read_mostly cc_default_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock_call_cc() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on the BSP during kernel startup. 
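/*
 * Illustrative note (editor's addition): sbintime_t is a 32.32 fixed-point
 * count of seconds, so the callout_hash() and callout_get_bucket() helpers
 * defined further below make each callwheel bucket span 1/2^CC_HASH_SHIFT
 * of a second (about 3.9 ms with CC_HASH_SHIFT == 8), and
 * callout_callwheel_init() sizes the wheel to a power of two above
 * ncallout so the mask arithmetic wraps cleanly.  example_bucket() below
 * is a stand-alone restatement of that mapping, not an existing function.
 */
static u_int
example_bucket(sbintime_t c_time, u_int wheelmask)
{
        /* Same arithmetic as callout_get_bucket(): high bits, then wrap. */
        return ((u_int)(c_time >> (32 - 8)) & wheelmask);
}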
*/ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; int cpu; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Initialize callout wheels. The software interrupt threads * are created later. */ cc_default_cpu = PCPU_GET(cpuid); CPU_FOREACH(cpu) { cc = CC_CPU(cpu); callout_cpu_init(cc, cpu); } } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN); cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); #ifdef KTR snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); #endif } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start softclock threads. */ static void start_softclock(void *dummy) { struct proc *p; struct thread *td; struct callout_cpu *cc; int cpu, error; bool pin_swi; p = NULL; CPU_FOREACH(cpu) { cc = CC_CPU(cpu); error = kproc_kthread_add(softclock_thread, cc, &p, &td, RFSTOPPED, 0, "clock", "clock (%d)", cpu); if (error != 0) panic("failed to create softclock thread for cpu %d: %d", cpu, error); CC_LOCK(cc); cc->cc_thread = td; thread_lock(td); sched_class(td, PRI_ITHD); sched_ithread_prio(td, PI_SOFTCLOCK); TD_SET_IWAIT(td); thread_lock_set(td, (struct mtx *)&cc->cc_lock); thread_unlock(td); if (cpu == cc_default_cpu) pin_swi = pin_default_swi; else pin_swi = pin_pcpu_swi; if (pin_swi) { error = cpuset_setithread(td->td_tid, cpu); if (error != 0) printf("%s: %s clock couldn't be pinned to cpu %d: %d\n", __func__, cpu == cc_default_cpu ? 
"default" : "per-cpu", cpu, error); } } } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout_entropy { struct callout_cpu *cc; struct thread *td; sbintime_t now; } entropy; struct callout *c, *next; struct callout_cpu *cc; struct callout_list *sc; struct thread *td; sbintime_t first, last, lookahead, max, tmp_max; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = SBT_1S; first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; LIST_FOREACH_SAFE(c, sc, c_links.le, next) { /* Run the callout if present time within allowed. */ if (c->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (c->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = next; cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(c, c_links.le); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); next = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { LIST_REMOVE(c, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, c, c_links.tqe); c->c_iflags |= CALLOUT_PROCESSED; } } else if (c->c_time >= max) { /* * Skip events in the distant future. */ ; } else if (c->c_time > last) { /* * Event minimal time is bigger than present * maximal time, so it cannot be aggregated. */ lastb = nowb; } else { /* * Update first and last time, respecting this * event. */ if (c->c_time < first) first = c->c_time; tmp_max = c->c_time + c->c_precision; if (tmp_max < last) last = tmp_max; } } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. 
*/ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; cpu_new_callout(curcpu, last, first); #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif if (!TAILQ_EMPTY(&cc->cc_expireq)) { entropy.cc = cc; entropy.td = curthread; entropy.now = now; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_CALLOUT); td = cc->cc_thread; if (TD_AWAITING_INTR(td)) { thread_lock_block_wait(td); THREAD_LOCK_ASSERT(td, MA_OWNED); TD_CLR_IWAIT(td); sched_wakeup(td, SRQ_INTR); } else mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); } else mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(c->c_cpu, sbt, c->c_time); } } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; callout_func_t *c_func, *drain; void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; callout_func_t *new_func; void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static callout_func_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? 
LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_iflags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_last_func(cc, direct) = c_func; cc_exec_last_arg(cc, direct) = c_arg; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. */ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? 
C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt thread handler. * Run periodic events from timeout queue. */ static void softclock_thread(void *arg) { struct thread *td = curthread; struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth, gcalls, lockcalls, mpcalls; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); for (;;) { while (TAILQ_EMPTY(&cc->cc_expireq)) { /* * Use CC_LOCK(cc) as the thread_lock while * idle. */ thread_lock(td); thread_lock_set(td, (struct mtx *)&cc->cc_lock); TD_SET_IWAIT(td); mi_switch(SW_VOL | SWT_IWAIT); /* mi_switch() drops thread_lock(). */ CC_LOCK(cc); } #ifdef CALLOUT_PROFILING depth = gcalls = lockcalls = mpcalls = 0; #endif while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif } } void callout_when(sbintime_t sbt, sbintime_t precision, int flags, sbintime_t *res, sbintime_t *prec_res) { sbintime_t to_sbt, to_pr; if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { *res = sbt; *prec_res = precision; return; } if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) sbt = tick_sbt; if ((flags & C_HARDCLOCK) != 0 || sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif if (cold && to_sbt == 0) to_sbt = sbinuptime(); if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; *res = to_sbt; to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); *prec_res = to_pr > precision ? to_pr : precision; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. 
* callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, callout_func_t *ftn, void *arg, int cpu, int flags) { sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; cancelled = 0; callout_when(sbt, prec, flags, &to_sbt, &precision); /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL || (LOCK_CLASS(c->c_lock)->lc_flags & LC_SPINLOCK), ("%s: direct callout %p has non-spin lock", __func__, c)); cc = callout_lock(c); if (cpu == -1) cpu = c->c_cpu; KASSERT(cpu >= 0 && cpu <= mp_maxid && !CPU_ABSENT(cpu), ("%s: invalid cpu %d", __func__, cpu)); if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). 
*/ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int flags, callout_func_t *drain) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int cancelled, not_on_a_list; if ((flags & CS_DRAIN) != 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); KASSERT((flags & CS_DRAIN) == 0 || drain == NULL, ("Cannot set drain callback and CS_DRAIN flag at the same time")); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout is running, try to stop it or drain it. */ if (cc_exec_curr(cc, direct) == c) { /* * Succeed we to stop it or not, we must clear the * active flag - this is what API users expect. If we're * draining and the callout is currently executing, first wait * until it finishes. */ if ((flags & CS_DRAIN) == 0) c->c_flags &= ~CALLOUT_ACTIVE; if ((flags & CS_DRAIN) != 0) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. 
*/ if (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); goto again; } c->c_flags &= ~CALLOUT_ACTIVE; } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). */ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. 
*/ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { KASSERT(cc_exec_drain(cc, direct) == NULL, ("callout drain function already set to %p", cc_exec_drain(cc, direct))); cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return (0); } else { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { KASSERT(cc_exec_drain(cc, direct) == NULL, ("callout drain function already set to %p", cc_exec_drain(cc, direct))); cc_exec_drain(cc, direct) = drain; } } KASSERT(!sq_locked, ("sleepqueue chain still locked")); cancelled = 0; } else cancelled = 1; if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); if ((c->c_iflags & CALLOUT_PENDING) == 0) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); /* * For not scheduled and not executing callout return * negative value. */ if (cc_exec_curr(cc, direct) != c) cancelled = -1; CC_UNLOCK(cc); return (cancelled); } c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } CC_UNLOCK(cc); return (cancelled); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = cc_default_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & LC_SLEEPABLE), ("%s: callout %p has sleepable lock", __func__, c)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = cc_default_cpu; } static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. 
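/*
 * Illustrative sketch (editor's addition): the consumer-side pattern served
 * by the callout_reset_sbt(9), callout_stop(9), and callout_drain(9) code
 * above.  The softc, lock name, and 50 ms period are placeholders.  Per
 * _callout_stop_safe() above, callout_stop() returns 1 when it cancels a
 * pending callout, 0 when the callout is currently running and cannot be
 * cancelled, and -1 when it was neither pending nor running.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

struct example_softc {
        struct mtx      ex_mtx;
        struct callout  ex_timer;
};

static void
example_tick(void *arg)
{
        struct example_softc *sc = arg;

        /* Runs with ex_mtx held because of callout_init_mtx() below. */
        mtx_assert(&sc->ex_mtx, MA_OWNED);
        /* ... periodic work ..., then re-arm 50 ms out with 12.5% slop. */
        callout_reset_sbt(&sc->ex_timer, 50 * SBT_1MS, 0, example_tick, sc,
            C_PREL(3));
}

static void
example_start(struct example_softc *sc)
{
        mtx_init(&sc->ex_mtx, "example", NULL, MTX_DEF);
        callout_init_mtx(&sc->ex_timer, &sc->ex_mtx, 0);
        mtx_lock(&sc->ex_mtx);
        callout_reset_sbt(&sc->ex_timer, 50 * SBT_1MS, 0, example_tick, sc,
            C_PREL(3));
        mtx_unlock(&sc->ex_mtx);
}

static void
example_stop(struct example_softc *sc)
{
        /* callout_drain() may sleep, so call it without ex_mtx held. */
        callout_drain(&sc->ex_timer);
        mtx_destroy(&sc->ex_mtx);
}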
*/ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; int cpu; val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); CPU_FOREACH(cpu) { cc = CC_CPU(cpu); CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); } for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? (1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? 
(((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); #ifdef DDB static void _show_callout(struct callout *c) { db_printf("callout %p\n", c); #define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); db_printf(" &c_links = %p\n", &(c->c_links)); C_DB_PRINTF("%" PRId64, c_time); C_DB_PRINTF("%" PRId64, c_precision); C_DB_PRINTF("%p", c_arg); C_DB_PRINTF("%p", c_func); C_DB_PRINTF("%p", c_lock); C_DB_PRINTF("%#x", c_flags); C_DB_PRINTF("%#x", c_iflags); C_DB_PRINTF("%d", c_cpu); #undef C_DB_PRINTF } DB_SHOW_COMMAND(callout, db_show_callout) { if (!have_addr) { db_printf("usage: show callout \n"); return; } _show_callout((struct callout *)addr); } static void _show_last_callout(int cpu, int direct, const char *dirstr) { struct callout_cpu *cc; void *func, *arg; cc = CC_CPU(cpu); func = cc_exec_last_func(cc, direct); arg = cc_exec_last_arg(cc, direct); db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func); db_printsym((db_expr_t)func, DB_STGY_ANY); db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg); } -DB_SHOW_COMMAND(callout_last, db_show_callout_last) +DB_SHOW_COMMAND_FLAGS(callout_last, db_show_callout_last, DB_CMD_MEMSAFE) { int cpu, last; if (have_addr) { if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) { db_printf("no such cpu: %d\n", (int)addr); return; } cpu = last = addr; } else { cpu = 0; last = mp_maxid; } while (cpu <= last) { if (!CPU_ABSENT(cpu)) { _show_last_callout(cpu, 0, ""); _show_last_callout(cpu, 1, " direct"); } cpu++; } } #endif /* DDB */ diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index f87d99bbedcf..5c278afb61c2 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -1,320 +1,320 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include /* * Autoconfiguration subroutines. */ /* * "Interrupt driven config" functions. */ static STAILQ_HEAD(, intr_config_hook) intr_config_hook_list = STAILQ_HEAD_INITIALIZER(intr_config_hook_list); static struct intr_config_hook *next_to_notify; static struct mtx intr_config_hook_lock; MTX_SYSINIT(intr_config_hook, &intr_config_hook_lock, "intr config", MTX_DEF); /* ARGSUSED */ static void run_interrupt_driven_config_hooks(void); /* * Private data and a shim function for implementing config_interhook_oneshot(). */ struct oneshot_config_hook { struct intr_config_hook och_hook; /* Must be first */ ich_func_t och_func; void *och_arg; }; static void config_intrhook_oneshot_func(void *arg) { struct oneshot_config_hook *ohook; ohook = arg; ohook->och_func(ohook->och_arg); config_intrhook_disestablish(&ohook->och_hook); free(ohook, M_DEVBUF); } /* * If we wait too long for an interrupt-driven config hook to return, print * a diagnostic. */ #define WARNING_INTERVAL_SECS 60 static void run_interrupt_driven_config_hooks_warning(int warned) { struct intr_config_hook *hook_entry; char namebuf[64]; long offset; if (warned < 6) { printf("run_interrupt_driven_hooks: still waiting after %d " "seconds for", warned * WARNING_INTERVAL_SECS); STAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) { if (linker_search_symbol_name( (caddr_t)hook_entry->ich_func, namebuf, sizeof(namebuf), &offset) == 0) printf(" %s", namebuf); else printf(" %p", hook_entry->ich_func); } printf("\n"); } KASSERT(warned < 6, ("run_interrupt_driven_config_hooks: waited too long")); } static void run_interrupt_driven_config_hooks() { static int running; struct intr_config_hook *hook_entry; TSENTER(); mtx_lock(&intr_config_hook_lock); /* * If hook processing is already active, any newly * registered hooks will eventually be notified. * Let the currently running session issue these * notifications. */ if (running != 0) { mtx_unlock(&intr_config_hook_lock); return; } running = 1; while (next_to_notify != NULL) { hook_entry = next_to_notify; next_to_notify = STAILQ_NEXT(hook_entry, ich_links); hook_entry->ich_state = ICHS_RUNNING; mtx_unlock(&intr_config_hook_lock); (*hook_entry->ich_func)(hook_entry->ich_arg); mtx_lock(&intr_config_hook_lock); } running = 0; mtx_unlock(&intr_config_hook_lock); TSEXIT(); } static void boot_run_interrupt_driven_config_hooks(void *dummy) { int warned; run_interrupt_driven_config_hooks(); /* Block boot processing until all hooks are disestablished. 
*/ TSWAIT("config hooks"); mtx_lock(&intr_config_hook_lock); warned = 0; while (!STAILQ_EMPTY(&intr_config_hook_list)) { if (msleep(&intr_config_hook_list, &intr_config_hook_lock, 0, "conifhk", WARNING_INTERVAL_SECS * hz) == EWOULDBLOCK) { mtx_unlock(&intr_config_hook_lock); warned++; run_interrupt_driven_config_hooks_warning(warned); mtx_lock(&intr_config_hook_lock); } } mtx_unlock(&intr_config_hook_lock); TSUNWAIT("config hooks"); } SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, boot_run_interrupt_driven_config_hooks, NULL); /* * Register a hook that will be called after "cold" * autoconfiguration is complete and interrupts can * be used to complete initialization. */ int config_intrhook_establish(struct intr_config_hook *hook) { struct intr_config_hook *hook_entry; TSHOLD("config hooks"); mtx_lock(&intr_config_hook_lock); STAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) if (hook_entry == hook) break; if (hook_entry != NULL) { mtx_unlock(&intr_config_hook_lock); printf("config_intrhook_establish: establishing an " "already established hook.\n"); return (1); } STAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links); if (next_to_notify == NULL) next_to_notify = hook; hook->ich_state = ICHS_QUEUED; mtx_unlock(&intr_config_hook_lock); if (cold == 0) /* * XXX Call from a task since not all drivers expect * to be re-entered at the time a hook is established. */ /* XXX Sufficient for modules loaded after initial config??? */ run_interrupt_driven_config_hooks(); return (0); } /* * Register a hook function that is automatically unregistered after it runs. */ void config_intrhook_oneshot(ich_func_t func, void *arg) { struct oneshot_config_hook *ohook; ohook = malloc(sizeof(*ohook), M_DEVBUF, M_WAITOK); ohook->och_func = func; ohook->och_arg = arg; ohook->och_hook.ich_func = config_intrhook_oneshot_func; ohook->och_hook.ich_arg = ohook; config_intrhook_establish(&ohook->och_hook); } static void config_intrhook_disestablish_locked(struct intr_config_hook *hook) { struct intr_config_hook *hook_entry; STAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) if (hook_entry == hook) break; if (hook_entry == NULL) panic("config_intrhook_disestablish: disestablishing an " "unestablished hook"); if (next_to_notify == hook) next_to_notify = STAILQ_NEXT(hook, ich_links); STAILQ_REMOVE(&intr_config_hook_list, hook, intr_config_hook, ich_links); TSRELEASE("config hooks"); /* Wakeup anyone watching the list */ hook->ich_state = ICHS_DONE; wakeup(&intr_config_hook_list); } void config_intrhook_disestablish(struct intr_config_hook *hook) { mtx_lock(&intr_config_hook_lock); config_intrhook_disestablish_locked(hook); mtx_unlock(&intr_config_hook_lock); } int config_intrhook_drain(struct intr_config_hook *hook) { mtx_lock(&intr_config_hook_lock); /* * The config hook has completed, so just return. */ if (hook->ich_state == ICHS_DONE) { mtx_unlock(&intr_config_hook_lock); return (ICHS_DONE); } /* * The config hook hasn't started running, just call disestablish. */ if (hook->ich_state == ICHS_QUEUED) { config_intrhook_disestablish_locked(hook); mtx_unlock(&intr_config_hook_lock); return (ICHS_QUEUED); } /* * The config hook is running, so wait for it to complete and return. */ while (hook->ich_state != ICHS_DONE) { if (msleep(&intr_config_hook_list, &intr_config_hook_lock, 0, "confhd", hz) == EWOULDBLOCK) { // XXX do I whine? 
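/*
 * Illustrative sketch (editor's addition): how a driver typically defers
 * work that needs interrupts until after cold autoconfiguration, using the
 * one-shot variant above so it does not have to call
 * config_intrhook_disestablish() itself.  struct mydev_softc and
 * mydev_delayed_attach() are placeholders.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>

struct mydev_softc {
        device_t        dev;
};

static void
mydev_delayed_attach(void *arg)
{
        struct mydev_softc *sc = arg;

        /* Interrupts work now; finish the initialization that needs them. */
        (void)sc;
}

static int
mydev_attach(device_t dev)
{
        struct mydev_softc *sc = device_get_softc(dev);

        sc->dev = dev;
        /* The hook runs once, then is disestablished and freed for us. */
        config_intrhook_oneshot(mydev_delayed_attach, sc);
        return (0);
}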
} } mtx_unlock(&intr_config_hook_lock); return (ICHS_RUNNING); } #ifdef DDB #include -DB_SHOW_COMMAND(conifhk, db_show_conifhk) +DB_SHOW_COMMAND_FLAGS(conifhk, db_show_conifhk, DB_CMD_MEMSAFE) { struct intr_config_hook *hook_entry; char namebuf[64]; long offset; STAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) { if (linker_ddb_search_symbol_name( (caddr_t)hook_entry->ich_func, namebuf, sizeof(namebuf), &offset) == 0) { db_printf("hook: %p at %s+%#lx arg: %p\n", hook_entry->ich_func, namebuf, offset, hook_entry->ich_arg); } else { db_printf("hook: %p at ??+?? arg %p\n", hook_entry->ich_func, hook_entry->ich_arg); } } } #endif /* DDB */ diff --git a/sys/kern/subr_devmap.c b/sys/kern/subr_devmap.c index 1c9b192cc4c8..f29e6df472b5 100644 --- a/sys/kern/subr_devmap.c +++ b/sys/kern/subr_devmap.c @@ -1,352 +1,352 @@ /*- * Copyright (c) 2013 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Routines for mapping device memory. */ #include "opt_ddb.h" #include #include #include #include #include #include #include static const struct devmap_entry *devmap_table; static boolean_t devmap_bootstrap_done = false; /* * The allocated-kva (akva) devmap table and metadata. Platforms can call * devmap_add_entry() to add static device mappings to this table using * automatically allocated virtual addresses carved out of the top of kva space. * Allocation begins immediately below the max kernel virtual address. */ #define AKVA_DEVMAP_MAX_ENTRIES 32 static struct devmap_entry akva_devmap_entries[AKVA_DEVMAP_MAX_ENTRIES]; static u_int akva_devmap_idx; static vm_offset_t akva_devmap_vaddr = DEVMAP_MAX_VADDR; #if defined(__aarch64__) || defined(__riscv) extern int early_boot; #endif /* * Print the contents of the static mapping table using the provided printf-like * output function (which will be either printf or db_printf). 
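/*
 * Editor's note on the change itself: every hunk in this patch converts a
 * DB_SHOW_COMMAND() definition into DB_SHOW_COMMAND_FLAGS() with
 * DB_CMD_MEMSAFE, flagging commands that do not touch memory at
 * caller-supplied addresses so they can remain available when ddb(4) is
 * restricted (see ddb(4) for the exact policy).  A minimal sketch of the
 * pattern, with a hypothetical command name:
 */
#ifdef DDB
DB_SHOW_COMMAND_FLAGS(example, db_show_example, DB_CMD_MEMSAFE)
{
        db_printf("nothing to show\n");
}
#endif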
*/ static void devmap_dump_table(int (*prfunc)(const char *, ...)) { const struct devmap_entry *pd; if (devmap_table == NULL || devmap_table[0].pd_size == 0) { prfunc("No static device mappings.\n"); return; } prfunc("Static device mappings:\n"); for (pd = devmap_table; pd->pd_size != 0; ++pd) { prfunc(" 0x%08jx - 0x%08jx mapped at VA 0x%08jx\n", (uintmax_t)pd->pd_pa, (uintmax_t)(pd->pd_pa + pd->pd_size - 1), (uintmax_t)pd->pd_va); } } /* * Print the contents of the static mapping table. Used for bootverbose. */ void devmap_print_table() { devmap_dump_table(printf); } /* * Return the "last" kva address used by the registered devmap table. It's * actually the lowest address used by the static mappings, i.e., the address of * the first unusable byte of KVA. */ vm_offset_t devmap_lastaddr() { const struct devmap_entry *pd; vm_offset_t lowaddr; if (akva_devmap_idx > 0) return (akva_devmap_vaddr); lowaddr = DEVMAP_MAX_VADDR; for (pd = devmap_table; pd != NULL && pd->pd_size != 0; ++pd) { if (lowaddr > pd->pd_va) lowaddr = pd->pd_va; } return (lowaddr); } /* * Add an entry to the internal "akva" static devmap table using the given * physical address and size and a virtual address allocated from the top of * kva. This automatically registers the akva table on the first call, so all a * platform has to do is call this routine to install as many mappings as it * needs and when the platform-specific init function calls devmap_bootstrap() * it will pick up all the entries in the akva table automatically. */ void devmap_add_entry(vm_paddr_t pa, vm_size_t sz) { struct devmap_entry *m; if (devmap_bootstrap_done) panic("devmap_add_entry() after devmap_bootstrap()"); if (akva_devmap_idx == (AKVA_DEVMAP_MAX_ENTRIES - 1)) panic("AKVA_DEVMAP_MAX_ENTRIES is too small"); if (akva_devmap_idx == 0) devmap_register_table(akva_devmap_entries); /* Allocate virtual address space from the top of kva downwards. */ #ifdef __arm__ /* * If the range being mapped is aligned and sized to 1MB boundaries then * also align the virtual address to the next-lower 1MB boundary so that * we end with a nice efficient section mapping. */ if ((pa & 0x000fffff) == 0 && (sz & 0x000fffff) == 0) { akva_devmap_vaddr = trunc_1mpage(akva_devmap_vaddr - sz); } else #endif { akva_devmap_vaddr = trunc_page(akva_devmap_vaddr - sz); } m = &akva_devmap_entries[akva_devmap_idx++]; m->pd_va = akva_devmap_vaddr; m->pd_pa = pa; m->pd_size = sz; } /* * Register the given table as the one to use in devmap_bootstrap(). */ void devmap_register_table(const struct devmap_entry *table) { devmap_table = table; } /* * Map all of the static regions in the devmap table, and remember the devmap * table so the mapdev, ptov, and vtop functions can do lookups later. * * If a non-NULL table pointer is given it is used unconditionally, otherwise * the previously-registered table is used. This smooths transition from legacy * code that fills in a local table then calls this function passing that table, * and newer code that uses devmap_register_table() in platform-specific * code, then lets the common platform-specific init function call this function * with a NULL pointer. */ void devmap_bootstrap(vm_offset_t l1pt, const struct devmap_entry *table) { const struct devmap_entry *pd; devmap_bootstrap_done = true; /* * If given a table pointer, use it. Otherwise, if a table was * previously registered, use it. Otherwise, no work to do. 
*/ if (table != NULL) devmap_table = table; else if (devmap_table == NULL) return; for (pd = devmap_table; pd->pd_size != 0; ++pd) { #if defined(__arm__) #if __ARM_ARCH >= 6 pmap_preboot_map_attr(pd->pd_pa, pd->pd_va, pd->pd_size, VM_PROT_READ | VM_PROT_WRITE, VM_MEMATTR_DEVICE); #else pmap_map_chunk(l1pt, pd->pd_va, pd->pd_pa, pd->pd_size, VM_PROT_READ | VM_PROT_WRITE, PTE_DEVICE); #endif #elif defined(__aarch64__) || defined(__riscv) pmap_kenter_device(pd->pd_va, pd->pd_size, pd->pd_pa); #endif } } /* * Look up the given physical address in the static mapping data and return the * corresponding virtual address, or NULL if not found. */ void * devmap_ptov(vm_paddr_t pa, vm_size_t size) { const struct devmap_entry *pd; if (devmap_table == NULL) return (NULL); for (pd = devmap_table; pd->pd_size != 0; ++pd) { if (pa >= pd->pd_pa && pa + size <= pd->pd_pa + pd->pd_size) return ((void *)(pd->pd_va + (pa - pd->pd_pa))); } return (NULL); } /* * Look up the given virtual address in the static mapping data and return the * corresponding physical address, or DEVMAP_PADDR_NOTFOUND if not found. */ vm_paddr_t devmap_vtop(void * vpva, vm_size_t size) { const struct devmap_entry *pd; vm_offset_t va; if (devmap_table == NULL) return (DEVMAP_PADDR_NOTFOUND); va = (vm_offset_t)vpva; for (pd = devmap_table; pd->pd_size != 0; ++pd) { if (va >= pd->pd_va && va + size <= pd->pd_va + pd->pd_size) return ((vm_paddr_t)(pd->pd_pa + (va - pd->pd_va))); } return (DEVMAP_PADDR_NOTFOUND); } /* * Map a set of physical memory pages into the kernel virtual address space. * Return a pointer to where it is mapped. * * This uses a pre-established static mapping if one exists for the requested * range, otherwise it allocates kva space and maps the physical pages into it. * * This routine is intended to be used for mapping device memory, NOT real * memory; the mapping type is inherently VM_MEMATTR_DEVICE in * pmap_kenter_device(). */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { vm_offset_t va, offset; void * rva; /* First look in the static mapping table. */ if ((rva = devmap_ptov(pa, size)) != NULL) return (rva); offset = pa & PAGE_MASK; pa = trunc_page(pa); size = round_page(size + offset); #if defined(__aarch64__) || defined(__riscv) if (early_boot) { akva_devmap_vaddr = trunc_page(akva_devmap_vaddr - size); va = akva_devmap_vaddr; KASSERT(va >= VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE, ("Too many early devmap mappings")); } else #endif va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pmap_kenter_device(va, size, pa); return ((void *)(va + offset)); } #if defined(__aarch64__) || defined(__riscv) void * pmap_mapdev_attr(vm_offset_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, offset; void * rva; /* First look in the static mapping table. */ if ((rva = devmap_ptov(pa, size)) != NULL) return (rva); offset = pa & PAGE_MASK; pa = trunc_page(pa); size = round_page(size + offset); if (early_boot) { akva_devmap_vaddr = trunc_page(akva_devmap_vaddr - size); va = akva_devmap_vaddr; KASSERT(va >= (VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE)), ("Too many early devmap mappings 2")); } else va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pmap_kenter(va, size, pa, ma); return ((void *)(va + offset)); } #endif /* * Unmap device memory and free the kva space. */ void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t offset; /* Nothing to do if we find the mapping in the static table. 
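/*
 * Illustrative sketch (editor's addition): the intended use of the akva
 * devmap interface above.  Platform early-init code registers the regions
 * it must reach before the VM system is up; once devmap_bootstrap() has
 * run, devmap_ptov() and pmap_mapdev() resolve into those static mappings
 * instead of allocating fresh KVA.  The UART address and size are
 * placeholders; pmap_mapdev() is declared by the MD pmap headers already
 * included in this file.
 */
#include <sys/param.h>
#include <sys/devmap.h>

static void
example_platform_devmap_init(void)
{
        /* Called from platform early init, before devmap_bootstrap(). */
        devmap_add_entry(0x12340000, 0x1000);   /* hypothetical UART */
}

static void *
example_map_uart(void)
{
        /* Falls back to kva_alloc() only if no static entry covers it. */
        return (pmap_mapdev(0x12340000, 0x1000));
}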
*/ if (devmap_vtop((void*)va, size) != DEVMAP_PADDR_NOTFOUND) return; offset = va & PAGE_MASK; va = trunc_page(va); size = round_page(size + offset); pmap_kremove_device(va, size); kva_free(va, size); } #ifdef DDB #include -DB_SHOW_COMMAND(devmap, db_show_devmap) +DB_SHOW_COMMAND_FLAGS(devmap, db_show_devmap, DB_CMD_MEMSAFE) { devmap_dump_table(db_printf); } #endif /* DDB */ diff --git a/sys/kern/subr_intr.c b/sys/kern/subr_intr.c index 16306c78cbd9..17f4a24ab860 100644 --- a/sys/kern/subr_intr.c +++ b/sys/kern/subr_intr.c @@ -1,1743 +1,1743 @@ /*- * Copyright (c) 2015-2016 Svatopluk Kraus * Copyright (c) 2015-2016 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * New-style Interrupt Framework * * TODO: - add support for disconnected PICs. * - to support IPI (PPI) enabling on other CPUs if already started. * - to complete things for removable PICs. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_iommu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #ifdef DDB #include #endif #ifdef IOMMU #include #endif #include "pic_if.h" #include "msi_if.h" #define INTRNAME_LEN (2*MAXCOMLEN + 1) #ifdef DEBUG #define debugf(fmt, args...) do { printf("%s(): ", __func__); \ printf(fmt,##args); } while (0) #else #define debugf(fmt, args...) #endif MALLOC_DECLARE(M_INTRNG); MALLOC_DEFINE(M_INTRNG, "intr", "intr interrupt handling"); /* Main interrupt handler called from assembler -> 'hidden' for C code. */ void intr_irq_handler(struct trapframe *tf); /* Root interrupt controller stuff. */ device_t intr_irq_root_dev; static intr_irq_filter_t *irq_root_filter; static void *irq_root_arg; static u_int irq_root_ipicount; struct intr_pic_child { SLIST_ENTRY(intr_pic_child) pc_next; struct intr_pic *pc_pic; intr_child_irq_filter_t *pc_filter; void *pc_filter_arg; uintptr_t pc_start; uintptr_t pc_length; }; /* Interrupt controller definition. 
*/ struct intr_pic { SLIST_ENTRY(intr_pic) pic_next; intptr_t pic_xref; /* hardware identification */ device_t pic_dev; /* Only one of FLAG_PIC or FLAG_MSI may be set */ #define FLAG_PIC (1 << 0) #define FLAG_MSI (1 << 1) #define FLAG_TYPE_MASK (FLAG_PIC | FLAG_MSI) u_int pic_flags; struct mtx pic_child_lock; SLIST_HEAD(, intr_pic_child) pic_children; }; static struct mtx pic_list_lock; static SLIST_HEAD(, intr_pic) pic_list; static struct intr_pic *pic_lookup(device_t dev, intptr_t xref, int flags); /* Interrupt source definition. */ static struct mtx isrc_table_lock; static struct intr_irqsrc **irq_sources; u_int irq_next_free; #ifdef SMP #ifdef EARLY_AP_STARTUP static bool irq_assign_cpu = true; #else static bool irq_assign_cpu = false; #endif #endif u_int intr_nirq = NIRQ; SYSCTL_UINT(_machdep, OID_AUTO, nirq, CTLFLAG_RDTUN, &intr_nirq, 0, "Number of IRQs"); /* Data for MI statistics reporting. */ u_long *intrcnt; char *intrnames; size_t sintrcnt; size_t sintrnames; static u_int intrcnt_index; static struct intr_irqsrc *intr_map_get_isrc(u_int res_id); static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc); static struct intr_map_data * intr_map_get_map_data(u_int res_id); static void intr_map_copy_map_data(u_int res_id, device_t *dev, intptr_t *xref, struct intr_map_data **data); /* * Interrupt framework initialization routine. */ static void intr_irq_init(void *dummy __unused) { u_int intrcnt_count; SLIST_INIT(&pic_list); mtx_init(&pic_list_lock, "intr pic list", NULL, MTX_DEF); mtx_init(&isrc_table_lock, "intr isrc table", NULL, MTX_DEF); /* * - 2 counters for each I/O interrupt. * - MAXCPU counters for each IPI counters for SMP. */ intrcnt_count = intr_nirq * 2; #ifdef SMP intrcnt_count += INTR_IPI_COUNT * MAXCPU; #endif intrcnt = mallocarray(intrcnt_count, sizeof(u_long), M_INTRNG, M_WAITOK | M_ZERO); intrnames = mallocarray(intrcnt_count, INTRNAME_LEN, M_INTRNG, M_WAITOK | M_ZERO); sintrcnt = intrcnt_count * sizeof(u_long); sintrnames = intrcnt_count * INTRNAME_LEN; irq_sources = mallocarray(intr_nirq, sizeof(struct intr_irqsrc*), M_INTRNG, M_WAITOK | M_ZERO); } SYSINIT(intr_irq_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_irq_init, NULL); static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, name); } /* * Update name for interrupt source with interrupt event. */ static void intrcnt_updatename(struct intr_irqsrc *isrc) { /* QQQ: What about stray counter name? */ mtx_assert(&isrc_table_lock, MA_OWNED); intrcnt_setname(isrc->isrc_event->ie_fullname, isrc->isrc_index); } /* * Virtualization for interrupt source interrupt counter increment. */ static inline void isrc_increment_count(struct intr_irqsrc *isrc) { if (isrc->isrc_flags & INTR_ISRCF_PPI) atomic_add_long(&isrc->isrc_count[0], 1); else isrc->isrc_count[0]++; } /* * Virtualization for interrupt source interrupt stray counter increment. */ static inline void isrc_increment_straycount(struct intr_irqsrc *isrc) { isrc->isrc_count[1]++; } /* * Virtualization for interrupt source interrupt name update. 
*/ static void isrc_update_name(struct intr_irqsrc *isrc, const char *name) { char str[INTRNAME_LEN]; mtx_assert(&isrc_table_lock, MA_OWNED); if (name != NULL) { snprintf(str, INTRNAME_LEN, "%s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index + 1); } else { snprintf(str, INTRNAME_LEN, "%s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index + 1); } } /* * Virtualization for interrupt source interrupt counters setup. */ static void isrc_setup_counters(struct intr_irqsrc *isrc) { u_int index; /* * XXX - it does not work well with removable controllers and * interrupt sources !!! */ index = atomic_fetchadd_int(&intrcnt_index, 2); isrc->isrc_index = index; isrc->isrc_count = &intrcnt[index]; isrc_update_name(isrc, NULL); } /* * Virtualization for interrupt source interrupt counters release. */ static void isrc_release_counters(struct intr_irqsrc *isrc) { panic("%s: not implemented", __func__); } #ifdef SMP /* * Virtualization for interrupt source IPI counters setup. */ u_long * intr_ipi_setup_counters(const char *name) { u_int index, i; char str[INTRNAME_LEN]; index = atomic_fetchadd_int(&intrcnt_index, MAXCPU); for (i = 0; i < MAXCPU; i++) { snprintf(str, INTRNAME_LEN, "cpu%d:%s", i, name); intrcnt_setname(str, index + i); } return (&intrcnt[index]); } #endif /* * Main interrupt dispatch handler. It's called straight * from the assembler, where CPU interrupt is served. */ void intr_irq_handler(struct trapframe *tf) { struct trapframe * oldframe; struct thread * td; KASSERT(irq_root_filter != NULL, ("%s: no filter", __func__)); VM_CNT_INC(v_intr); critical_enter(); td = curthread; oldframe = td->td_intr_frame; td->td_intr_frame = tf; irq_root_filter(irq_root_arg); td->td_intr_frame = oldframe; critical_exit(); #ifdef HWPMC_HOOKS if (pmc_hook && TRAPF_USERMODE(tf) && (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN)) pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, tf); #endif } int intr_child_irq_handler(struct intr_pic *parent, uintptr_t irq) { struct intr_pic_child *child; bool found; found = false; mtx_lock_spin(&parent->pic_child_lock); SLIST_FOREACH(child, &parent->pic_children, pc_next) { if (child->pc_start <= irq && irq < (child->pc_start + child->pc_length)) { found = true; break; } } mtx_unlock_spin(&parent->pic_child_lock); if (found) return (child->pc_filter(child->pc_filter_arg, irq)); return (FILTER_STRAY); } /* * interrupt controller dispatch function for interrupts. It should * be called straight from the interrupt controller, when associated interrupt * source is learned. */ int intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) { KASSERT(isrc != NULL, ("%s: no source", __func__)); isrc_increment_count(isrc); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { int error; error = isrc->isrc_filter(isrc->isrc_arg, tf); PIC_POST_FILTER(isrc->isrc_dev, isrc); if (error == FILTER_HANDLED) return (0); } else #endif if (isrc->isrc_event != NULL) { if (intr_event_handle(isrc->isrc_event, tf) == 0) return (0); } isrc_increment_straycount(isrc); return (EINVAL); } /* * Alloc unique interrupt number (resource handle) for interrupt source. * * There could be various strategies how to allocate free interrupt number * (resource handle) for new interrupt source. * * 1. 
Handles are always allocated forward, so handles are not recycled * immediately. However, if only one free handle left which is reused * constantly... */ static inline int isrc_alloc_irq(struct intr_irqsrc *isrc) { u_int irq; mtx_assert(&isrc_table_lock, MA_OWNED); if (irq_next_free >= intr_nirq) return (ENOSPC); for (irq = irq_next_free; irq < intr_nirq; irq++) { if (irq_sources[irq] == NULL) goto found; } for (irq = 0; irq < irq_next_free; irq++) { if (irq_sources[irq] == NULL) goto found; } irq_next_free = intr_nirq; return (ENOSPC); found: isrc->isrc_irq = irq; irq_sources[irq] = isrc; irq_next_free = irq + 1; if (irq_next_free >= intr_nirq) irq_next_free = 0; return (0); } /* * Free unique interrupt number (resource handle) from interrupt source. */ static inline int isrc_free_irq(struct intr_irqsrc *isrc) { mtx_assert(&isrc_table_lock, MA_OWNED); if (isrc->isrc_irq >= intr_nirq) return (EINVAL); if (irq_sources[isrc->isrc_irq] != isrc) return (EINVAL); irq_sources[isrc->isrc_irq] = NULL; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ /* * If we are recovering from the state irq_sources table is full, * then the following allocation should check the entire table. This * will ensure maximum separation of allocation order from release * order. */ if (irq_next_free >= intr_nirq) irq_next_free = 0; return (0); } /* * Initialize interrupt source and register it into global interrupt table. */ int intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags, const char *fmt, ...) { int error; va_list ap; bzero(isrc, sizeof(struct intr_irqsrc)); isrc->isrc_dev = dev; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ isrc->isrc_flags = flags; va_start(ap, fmt); vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap); va_end(ap); mtx_lock(&isrc_table_lock); error = isrc_alloc_irq(isrc); if (error != 0) { mtx_unlock(&isrc_table_lock); return (error); } /* * Setup interrupt counters, but not for IPI sources. Those are setup * later and only for used ones (up to INTR_IPI_COUNT) to not exhaust * our counter pool. */ if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_setup_counters(isrc); mtx_unlock(&isrc_table_lock); return (0); } /* * Deregister interrupt source from global interrupt table. */ int intr_isrc_deregister(struct intr_irqsrc *isrc) { int error; mtx_lock(&isrc_table_lock); if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_release_counters(isrc); error = isrc_free_irq(isrc); mtx_unlock(&isrc_table_lock); return (error); } #ifdef SMP /* * A support function for a PIC to decide if provided ISRC should be inited * on given cpu. The logic of INTR_ISRCF_BOUND flag and isrc_cpu member of * struct intr_irqsrc is the following: * * If INTR_ISRCF_BOUND is set, the ISRC should be inited only on cpus * set in isrc_cpu. If not, the ISRC should be inited on every cpu and * isrc_cpu is kept consistent with it. Thus isrc_cpu is always correct. */ bool intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu) { if (isrc->isrc_handlers == 0) return (false); if ((isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) == 0) return (false); if (isrc->isrc_flags & INTR_ISRCF_BOUND) return (CPU_ISSET(cpu, &isrc->isrc_cpu)); CPU_SET(cpu, &isrc->isrc_cpu); return (true); } #endif #ifdef INTR_SOLO /* * Setup filter into interrupt source. 
*/ static int iscr_setup_filter(struct intr_irqsrc *isrc, const char *name, intr_irq_filter_t *filter, void *arg, void **cookiep) { if (filter == NULL) return (EINVAL); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. */ if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { mtx_unlock(&isrc_table_lock); return (EBUSY); } isrc->isrc_filter = filter; isrc->isrc_arg = arg; isrc_update_name(isrc, name); mtx_unlock(&isrc_table_lock); *cookiep = isrc; return (0); } #endif /* * Interrupt source pre_ithread method for MI interrupt framework. */ static void intr_isrc_pre_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_PRE_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_ithread method for MI interrupt framework. */ static void intr_isrc_post_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_filter method for MI interrupt framework. */ static void intr_isrc_post_filter(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_FILTER(isrc->isrc_dev, isrc); } /* * Interrupt source assign_cpu method for MI interrupt framework. */ static int intr_isrc_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intr_irqsrc *isrc = arg; int error; mtx_lock(&isrc_table_lock); if (cpu == NOCPU) { CPU_ZERO(&isrc->isrc_cpu); isrc->isrc_flags &= ~INTR_ISRCF_BOUND; } else { CPU_SETOF(cpu, &isrc->isrc_cpu); isrc->isrc_flags |= INTR_ISRCF_BOUND; } /* * In NOCPU case, it's up to PIC to either leave ISRC on same CPU or * re-balance it to another CPU or enable it on more CPUs. However, * PIC is expected to change isrc_cpu appropriately to keep us well * informed if the call is successful. */ if (irq_assign_cpu) { error = PIC_BIND_INTR(isrc->isrc_dev, isrc); if (error) { CPU_ZERO(&isrc->isrc_cpu); mtx_unlock(&isrc_table_lock); return (error); } } mtx_unlock(&isrc_table_lock); return (0); #else return (EOPNOTSUPP); #endif } /* * Create interrupt event for interrupt source. */ static int isrc_event_create(struct intr_irqsrc *isrc) { struct intr_event *ie; int error; error = intr_event_create(&ie, isrc, 0, isrc->isrc_irq, intr_isrc_pre_ithread, intr_isrc_post_ithread, intr_isrc_post_filter, intr_isrc_assign_cpu, "%s:", isrc->isrc_name); if (error) return (error); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. Let contested event wins. */ #ifdef INTR_SOLO if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { #else if (isrc->isrc_event != NULL) { #endif mtx_unlock(&isrc_table_lock); intr_event_destroy(ie); return (isrc->isrc_event != NULL ? EBUSY : 0); } isrc->isrc_event = ie; mtx_unlock(&isrc_table_lock); return (0); } #ifdef notyet /* * Destroy interrupt event for interrupt source. */ static void isrc_event_destroy(struct intr_irqsrc *isrc) { struct intr_event *ie; mtx_lock(&isrc_table_lock); ie = isrc->isrc_event; isrc->isrc_event = NULL; mtx_unlock(&isrc_table_lock); if (ie != NULL) intr_event_destroy(ie); } #endif /* * Add handler to interrupt source. 
*/ static int isrc_add_handler(struct intr_irqsrc *isrc, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) { int error; if (isrc->isrc_event == NULL) { error = isrc_event_create(isrc); if (error) return (error); } error = intr_event_add_handler(isrc->isrc_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } /* * Lookup interrupt controller locked. */ static inline struct intr_pic * pic_lookup_locked(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_assert(&pic_list_lock, MA_OWNED); if (dev == NULL && xref == 0) return (NULL); /* Note that pic->pic_dev is never NULL on registered PIC. */ SLIST_FOREACH(pic, &pic_list, pic_next) { if ((pic->pic_flags & FLAG_TYPE_MASK) != (flags & FLAG_TYPE_MASK)) continue; if (dev == NULL) { if (xref == pic->pic_xref) return (pic); } else if (xref == 0 || pic->pic_xref == 0) { if (dev == pic->pic_dev) return (pic); } else if (xref == pic->pic_xref && dev == pic->pic_dev) return (pic); } return (NULL); } /* * Lookup interrupt controller. */ static struct intr_pic * pic_lookup(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); mtx_unlock(&pic_list_lock); return (pic); } /* * Create interrupt controller. */ static struct intr_pic * pic_create(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); if (pic != NULL) { mtx_unlock(&pic_list_lock); return (pic); } pic = malloc(sizeof(*pic), M_INTRNG, M_NOWAIT | M_ZERO); if (pic == NULL) { mtx_unlock(&pic_list_lock); return (NULL); } pic->pic_xref = xref; pic->pic_dev = dev; pic->pic_flags = flags; mtx_init(&pic->pic_child_lock, "pic child lock", NULL, MTX_SPIN); SLIST_INSERT_HEAD(&pic_list, pic, pic_next); mtx_unlock(&pic_list_lock); return (pic); } #ifdef notyet /* * Destroy interrupt controller. */ static void pic_destroy(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); if (pic == NULL) { mtx_unlock(&pic_list_lock); return; } SLIST_REMOVE(&pic_list, pic, intr_pic, pic_next); mtx_unlock(&pic_list_lock); free(pic, M_INTRNG); } #endif /* * Register interrupt controller. */ struct intr_pic * intr_pic_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (NULL); pic = pic_create(dev, xref, FLAG_PIC); if (pic == NULL) return (NULL); debugf("PIC %p registered for %s \n", pic, device_get_nameunit(dev), dev, (uintmax_t)xref); return (pic); } /* * Unregister interrupt controller. */ int intr_pic_deregister(device_t dev, intptr_t xref) { panic("%s: not implemented", __func__); } /* * Mark interrupt controller (itself) as a root one. * * Note that only an interrupt controller can really know its position * in interrupt controller's tree. So root PIC must claim itself as a root. * * In FDT case, according to ePAPR approved version 1.1 from 08 April 2011, * page 30: * "The root of the interrupt tree is determined when traversal * of the interrupt tree reaches an interrupt controller node without * an interrupts property and thus no explicit interrupt parent." 
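/*
 * Hypothetical usage sketch (not part of this change): how a root interrupt
 * controller driver might plug into the framework above from its attach
 * routine.  The "foo_pic" driver, its softc and FOO_PIC_XREF are illustrative
 * assumptions; intr_pic_register() and intr_pic_claim_root() are the existing
 * KPI declared in <sys/intr.h>.
 */
struct foo_pic_softc {
	device_t	sc_dev;
};

#define	FOO_PIC_XREF	0x100		/* made-up controller xref */

static int
foo_pic_filter(void *arg)
{
	struct foo_pic_softc *sc = arg;

	/*
	 * A real filter would read the controller's claim register here,
	 * look up the matching struct intr_irqsrc and pass it to
	 * intr_isrc_dispatch() together with curthread->td_intr_frame.
	 */
	(void)sc;
	return (FILTER_HANDLED);
}

static int
foo_pic_attach(device_t dev)
{
	struct foo_pic_softc *sc = device_get_softc(dev);

	sc->sc_dev = dev;
	if (intr_pic_register(dev, FOO_PIC_XREF) == NULL)
		return (ENXIO);
	/* Claim the root; foo_pic_filter() then runs from intr_irq_handler(). */
	return (intr_pic_claim_root(dev, FOO_PIC_XREF, foo_pic_filter, sc,
	    0 /* no IPIs in this sketch */));
}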
*/ int intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter, void *arg, u_int ipicount) { struct intr_pic *pic; pic = pic_lookup(dev, xref, FLAG_PIC); if (pic == NULL) { device_printf(dev, "not registered\n"); return (EINVAL); } KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); if (filter == NULL) { device_printf(dev, "filter missing\n"); return (EINVAL); } /* * Only one interrupt controller can be the root for now. * Note that we further assume that there is no threaded interrupt * routine (handler) on the root. See intr_irq_handler(). */ if (intr_irq_root_dev != NULL) { device_printf(dev, "another root already set\n"); return (EBUSY); } intr_irq_root_dev = dev; irq_root_filter = filter; irq_root_arg = arg; irq_root_ipicount = ipicount; debugf("irq root set to %s\n", device_get_nameunit(dev)); return (0); } /* * Add a handler to manage a subrange of a parent's interrupts. */ int intr_pic_add_handler(device_t parent, struct intr_pic *pic, intr_child_irq_filter_t *filter, void *arg, uintptr_t start, uintptr_t length) { struct intr_pic *parent_pic; struct intr_pic_child *newchild; #ifdef INVARIANTS struct intr_pic_child *child; #endif /* Find the parent PIC */ parent_pic = pic_lookup(parent, 0, FLAG_PIC); if (parent_pic == NULL) return (ENXIO); newchild = malloc(sizeof(*newchild), M_INTRNG, M_WAITOK | M_ZERO); newchild->pc_pic = pic; newchild->pc_filter = filter; newchild->pc_filter_arg = arg; newchild->pc_start = start; newchild->pc_length = length; mtx_lock_spin(&parent_pic->pic_child_lock); #ifdef INVARIANTS SLIST_FOREACH(child, &parent_pic->pic_children, pc_next) { KASSERT(child->pc_pic != pic, ("%s: Adding a child PIC twice", __func__)); } #endif SLIST_INSERT_HEAD(&parent_pic->pic_children, newchild, pc_next); mtx_unlock_spin(&parent_pic->pic_child_lock); return (0); } static int intr_resolve_irq(device_t dev, intptr_t xref, struct intr_map_data *data, struct intr_irqsrc **isrc) { struct intr_pic *pic; struct intr_map_data_msi *msi; if (data == NULL) return (EINVAL); pic = pic_lookup(dev, xref, (data->type == INTR_MAP_DATA_MSI) ?
FLAG_MSI : FLAG_PIC); if (pic == NULL) return (ESRCH); switch (data->type) { case INTR_MAP_DATA_MSI: KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *)data; *isrc = msi->isrc; return (0); default: KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); return (PIC_MAP_INTR(pic->pic_dev, data, isrc)); } } bool intr_is_per_cpu(struct resource *res) { u_int res_id; struct intr_irqsrc *isrc; res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) panic("Attempt to get isrc for non-active resource id: %u\n", res_id); return ((isrc->isrc_flags & INTR_ISRCF_PPI) != 0); } int intr_activate_irq(device_t dev, struct resource *res) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); if (intr_map_get_isrc(res_id) != NULL) panic("Attempt to double activation of resource id: %u\n", res_id); intr_map_copy_map_data(res_id, &map_dev, &map_xref, &data); error = intr_resolve_irq(map_dev, map_xref, data, &isrc); if (error != 0) { free(data, M_INTRNG); /* XXX TODO DISCONECTED PICs */ /* if (error == EINVAL) return(0); */ return (error); } intr_map_set_isrc(res_id, isrc); rman_set_virtual(res, data); return (PIC_ACTIVATE_INTR(isrc->isrc_dev, isrc, res, data)); } int intr_deactivate_irq(device_t dev, struct resource *res) { struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) panic("Attempt to deactivate non-active resource id: %u\n", res_id); data = rman_get_virtual(res); error = PIC_DEACTIVATE_INTR(isrc->isrc_dev, isrc, res, data); intr_map_set_isrc(res_id, NULL); rman_set_virtual(res, NULL); free(data, M_INTRNG); return (error); } int intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt, driver_intr_t hand, void *arg, int flags, void **cookiep) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; const char *name; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) { /* XXX TODO DISCONECTED PICs */ return (EINVAL); } data = rman_get_virtual(res); name = device_get_nameunit(dev); #ifdef INTR_SOLO /* * Standard handling is done through MI interrupt framework. However, * some interrupts could request solely own special handling. This * non standard handling can be used for interrupt controllers without * handler (filter only), so in case that interrupt controllers are * chained, MI interrupt framework is called only in leaf controller. * * Note that root interrupt controller routine is served as well, * however in intr_irq_handler(), i.e. main system dispatch routine. 
*/ if (flags & INTR_SOLO && hand != NULL) { debugf("irq %u cannot solo on %s\n", irq, name); return (EINVAL); } if (flags & INTR_SOLO) { error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt, arg, cookiep); debugf("irq %u setup filter error %d on %s\n", isrc->isrc_irq, error, name); } else #endif { error = isrc_add_handler(isrc, name, filt, hand, arg, flags, cookiep); debugf("irq %u add handler error %d on %s\n", isrc->isrc_irq, error, name); } if (error != 0) return (error); mtx_lock(&isrc_table_lock); error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data); if (error == 0) { isrc->isrc_handlers++; if (isrc->isrc_handlers == 1) PIC_ENABLE_INTR(isrc->isrc_dev, isrc); } mtx_unlock(&isrc_table_lock); if (error != 0) intr_event_remove_handler(*cookiep); return (error); } int intr_teardown_irq(device_t dev, struct resource *res, void *cookie) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); data = rman_get_virtual(res); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc->isrc_filter = NULL; isrc->isrc_arg = NULL; isrc->isrc_handlers = 0; PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); isrc_update_name(isrc, NULL); mtx_unlock(&isrc_table_lock); return (0); } #endif if (isrc != intr_handler_source(cookie)) return (EINVAL); error = intr_event_remove_handler(cookie); if (error == 0) { mtx_lock(&isrc_table_lock); isrc->isrc_handlers--; if (isrc->isrc_handlers == 0) PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } int intr_describe_irq(device_t dev, struct resource *res, void *cookie, const char *descr) { int error; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc_update_name(isrc, descr); mtx_unlock(&isrc_table_lock); return (0); } #endif error = intr_event_describe_handler(isrc->isrc_event, cookie, descr); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } #ifdef SMP int intr_bind_irq(device_t dev, struct resource *res, int cpu) { struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) return (intr_isrc_assign_cpu(isrc, cpu)); #endif return (intr_event_bind(isrc->isrc_event, cpu)); } /* * Return the CPU that the next interrupt source should use. * For now just returns the next CPU according to round-robin. 
*/ u_int intr_irq_next_cpu(u_int last_cpu, cpuset_t *cpumask) { u_int cpu; KASSERT(!CPU_EMPTY(cpumask), ("%s: Empty CPU mask", __func__)); if (!irq_assign_cpu || mp_ncpus == 1) { cpu = PCPU_GET(cpuid); if (CPU_ISSET(cpu, cpumask)) return (curcpu); return (CPU_FFS(cpumask) - 1); } do { last_cpu++; if (last_cpu > mp_maxid) last_cpu = 0; } while (!CPU_ISSET(last_cpu, cpumask)); return (last_cpu); } #ifndef EARLY_AP_STARTUP /* * Distribute all the interrupt sources among the available * CPUs once the AP's have been launched. */ static void intr_irq_shuffle(void *arg __unused) { struct intr_irqsrc *isrc; u_int i; if (mp_ncpus == 1) return; mtx_lock(&isrc_table_lock); irq_assign_cpu = true; for (i = 0; i < intr_nirq; i++) { isrc = irq_sources[i]; if (isrc == NULL || isrc->isrc_handlers == 0 || isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) continue; if (isrc->isrc_event != NULL && isrc->isrc_flags & INTR_ISRCF_BOUND && isrc->isrc_event->ie_cpu != CPU_FFS(&isrc->isrc_cpu) - 1) panic("%s: CPU inconsistency", __func__); if ((isrc->isrc_flags & INTR_ISRCF_BOUND) == 0) CPU_ZERO(&isrc->isrc_cpu); /* start again */ /* * We are in wicked position here if the following call fails * for bound ISRC. The best thing we can do is to clear * isrc_cpu so inconsistency with ie_cpu will be detectable. */ if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0) CPU_ZERO(&isrc->isrc_cpu); } mtx_unlock(&isrc_table_lock); } SYSINIT(intr_irq_shuffle, SI_SUB_SMP, SI_ORDER_SECOND, intr_irq_shuffle, NULL); #endif /* !EARLY_AP_STARTUP */ #else u_int intr_irq_next_cpu(u_int current_cpu, cpuset_t *cpumask) { return (PCPU_GET(cpuid)); } #endif /* SMP */ /* * Allocate memory for new intr_map_data structure. * Initialize common fields. */ struct intr_map_data * intr_alloc_map_data(enum intr_map_data_type type, size_t len, int flags) { struct intr_map_data *data; data = malloc(len, M_INTRNG, flags); data->type = type; data->len = len; return (data); } void intr_free_intr_map_data(struct intr_map_data *data) { free(data, M_INTRNG); } /* * Register a MSI/MSI-X interrupt controller */ int intr_msi_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (EINVAL); pic = pic_create(dev, xref, FLAG_MSI); if (pic == NULL) return (ENOMEM); debugf("PIC %p registered for %s \n", pic, device_get_nameunit(dev), dev, (uintmax_t)xref); return (0); } int intr_alloc_msi(device_t pci, device_t child, intptr_t xref, int count, int maxcount, int *irqs) { struct iommu_domain *domain; struct intr_irqsrc **isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err, i; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); /* * If this is the first time we have used this context ask the * interrupt controller to map memory the msi source will need. 
*/ err = MSI_IOMMU_INIT(pic->pic_dev, child, &domain); if (err != 0) return (err); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); err = MSI_ALLOC_MSI(pic->pic_dev, child, count, maxcount, &pdev, isrc); if (err != 0) { free(isrc, M_INTRNG); return (err); } for (i = 0; i < count; i++) { isrc[i]->isrc_iommu = domain; msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi-> isrc = isrc[i]; irqs[i] = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); } free(isrc, M_INTRNG); return (err); } int intr_release_msi(device_t pci, device_t child, intptr_t xref, int count, int *irqs) { struct intr_irqsrc **isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int i, err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); for (i = 0; i < count; i++) { msi = (struct intr_map_data_msi *) intr_map_get_map_data(irqs[i]); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irqs[i])); isrc[i] = msi->isrc; } MSI_IOMMU_DEINIT(pic->pic_dev, child); err = MSI_RELEASE_MSI(pic->pic_dev, child, count, isrc); for (i = 0; i < count; i++) { if (isrc[i] != NULL) intr_unmap_irq(irqs[i]); } free(isrc, M_INTRNG); return (err); } int intr_alloc_msix(device_t pci, device_t child, intptr_t xref, int *irq) { struct iommu_domain *domain; struct intr_irqsrc *isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); /* * If this is the first time we have used this context ask the * interrupt controller to map memory the msi source will need. 
*/ err = MSI_IOMMU_INIT(pic->pic_dev, child, &domain); if (err != 0) return (err); err = MSI_ALLOC_MSIX(pic->pic_dev, child, &pdev, &isrc); if (err != 0) return (err); isrc->isrc_iommu = domain; msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi->isrc = isrc; *irq = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); return (0); } int intr_release_msix(device_t pci, device_t child, intptr_t xref, int irq) { struct intr_irqsrc *isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *) intr_map_get_map_data(irq); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irq)); isrc = msi->isrc; if (isrc == NULL) { intr_unmap_irq(irq); return (EINVAL); } MSI_IOMMU_DEINIT(pic->pic_dev, child); err = MSI_RELEASE_MSIX(pic->pic_dev, child, isrc); intr_unmap_irq(irq); return (err); } int intr_map_msi(device_t pci, device_t child, intptr_t xref, int irq, uint64_t *addr, uint32_t *data) { struct intr_irqsrc *isrc; struct intr_pic *pic; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = intr_map_get_isrc(irq); if (isrc == NULL) return (EINVAL); err = MSI_MAP_MSI(pic->pic_dev, child, isrc, addr, data); #ifdef IOMMU if (isrc->isrc_iommu != NULL) iommu_translate_msi(isrc->isrc_iommu, addr); #endif return (err); } void dosoftints(void); void dosoftints(void) { } #ifdef SMP /* * Init interrupt controller on another CPU. */ void intr_pic_init_secondary(void) { /* * QQQ: Only root PIC is aware of other CPUs ??? */ KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); //mtx_lock(&isrc_table_lock); PIC_INIT_SECONDARY(intr_irq_root_dev); //mtx_unlock(&isrc_table_lock); } #endif #ifdef DDB -DB_SHOW_COMMAND(irqs, db_show_irqs) +DB_SHOW_COMMAND_FLAGS(irqs, db_show_irqs, DB_CMD_MEMSAFE) { u_int i, irqsum; u_long num; struct intr_irqsrc *isrc; for (irqsum = 0, i = 0; i < intr_nirq; i++) { isrc = irq_sources[i]; if (isrc == NULL) continue; num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0; db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i, isrc->isrc_name, isrc->isrc_cpu.__bits[0], isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num); irqsum += num; } db_printf("irq total %u\n", irqsum); } #endif /* * Interrupt mapping table functions. * * Please, keep this part separately, it can be transformed to * extension of standard resources. */ struct intr_map_entry { device_t dev; intptr_t xref; struct intr_map_data *map_data; struct intr_irqsrc *isrc; /* XXX TODO DISCONECTED PICs */ /*int flags */ }; /* XXX Convert irq_map[] to dynamicaly expandable one. 
*/ static struct intr_map_entry **irq_map; static u_int irq_map_count; static u_int irq_map_first_free_idx; static struct mtx irq_map_lock; static struct intr_irqsrc * intr_map_get_isrc(u_int res_id) { struct intr_irqsrc *isrc; isrc = NULL; mtx_lock(&irq_map_lock); if (res_id < irq_map_count && irq_map[res_id] != NULL) isrc = irq_map[res_id]->isrc; mtx_unlock(&irq_map_lock); return (isrc); } static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc) { mtx_lock(&irq_map_lock); if (res_id < irq_map_count && irq_map[res_id] != NULL) irq_map[res_id]->isrc = isrc; mtx_unlock(&irq_map_lock); } /* * Get a copy of intr_map_entry data */ static struct intr_map_data * intr_map_get_map_data(u_int res_id) { struct intr_map_data *data; data = NULL; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); data = irq_map[res_id]->map_data; mtx_unlock(&irq_map_lock); return (data); } /* * Get a copy of intr_map_entry data */ static void intr_map_copy_map_data(u_int res_id, device_t *map_dev, intptr_t *map_xref, struct intr_map_data **data) { size_t len; len = 0; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (irq_map[res_id]->map_data != NULL) len = irq_map[res_id]->map_data->len; mtx_unlock(&irq_map_lock); if (len == 0) *data = NULL; else *data = malloc(len, M_INTRNG, M_WAITOK | M_ZERO); mtx_lock(&irq_map_lock); if (irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (len != 0) { if (len != irq_map[res_id]->map_data->len) panic("Resource id: %u has changed.\n", res_id); memcpy(*data, irq_map[res_id]->map_data, len); } *map_dev = irq_map[res_id]->dev; *map_xref = irq_map[res_id]->xref; mtx_unlock(&irq_map_lock); } /* * Allocate and fill new entry in irq_map table. */ u_int intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data) { u_int i; struct intr_map_entry *entry; /* Prepare new entry first. */ entry = malloc(sizeof(*entry), M_INTRNG, M_WAITOK | M_ZERO); entry->dev = dev; entry->xref = xref; entry->map_data = data; entry->isrc = NULL; mtx_lock(&irq_map_lock); for (i = irq_map_first_free_idx; i < irq_map_count; i++) { if (irq_map[i] == NULL) { irq_map[i] = entry; irq_map_first_free_idx = i + 1; mtx_unlock(&irq_map_lock); return (i); } } mtx_unlock(&irq_map_lock); /* XXX Expand irq_map table */ panic("IRQ mapping table is full."); } /* * Remove and free mapping entry. */ void intr_unmap_irq(u_int res_id) { struct intr_map_entry *entry; mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) panic("Attempt to unmap invalid resource id: %u\n", res_id); entry = irq_map[res_id]; irq_map[res_id] = NULL; irq_map_first_free_idx = res_id; mtx_unlock(&irq_map_lock); intr_free_intr_map_data(entry->map_data); free(entry, M_INTRNG); } /* * Clone mapping entry. 
*/ u_int intr_map_clone_irq(u_int old_res_id) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; intr_map_copy_map_data(old_res_id, &map_dev, &map_xref, &data); return (intr_map_irq(map_dev, map_xref, data)); } static void intr_map_init(void *dummy __unused) { mtx_init(&irq_map_lock, "intr map table", NULL, MTX_DEF); irq_map_count = 2 * intr_nirq; irq_map = mallocarray(irq_map_count, sizeof(struct intr_map_entry*), M_INTRNG, M_WAITOK | M_ZERO); } SYSINIT(intr_map_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_map_init, NULL); diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c index aaa9b62bb936..c4c220d66c97 100644 --- a/sys/kern/subr_pcpu.c +++ b/sys/kern/subr_pcpu.c @@ -1,425 +1,425 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Wind River Systems, Inc. * All rights reserved. * Written by: John Baldwin * * Copyright (c) 2009 Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module provides MI support for per-cpu data. * * Each architecture determines the mapping of logical CPU IDs to physical * CPUs. The requirements of this mapping are as follows: * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. * - The mapping is not required to be dense. That is, there may be * gaps in the mappings. * - The platform sets the value of MAXCPU in . * - It is suggested, but not required, that in the non-SMP case, the * platform define MAXCPU to be 1 and define the logical ID of the * sole CPU as 0. 
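/*
 * Hypothetical usage sketch (not part of this change): how a module typically
 * consumes the dynamic per-cpu (DPCPU) space managed below.  The "foo_events"
 * counter and the helpers are illustrative assumptions; DPCPU_DEFINE_STATIC(),
 * DPCPU_GET()/DPCPU_SET(), DPCPU_ID_GET() and CPU_FOREACH() are the existing
 * KPI from <sys/pcpu.h> and <sys/smp.h>.
 */
DPCPU_DEFINE_STATIC(u_long, foo_events);

static void
foo_note_event(void)
{

	/* Pin to the current CPU for the read-modify-write of its copy. */
	critical_enter();
	DPCPU_SET(foo_events, DPCPU_GET(foo_events) + 1);
	critical_exit();
}

static u_long
foo_events_total(void)
{
	u_long total;
	int i;

	total = 0;
	CPU_FOREACH(i)
		total += DPCPU_ID_GET(i, foo_events);
	return (total);
}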
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accouting."); struct dpcpu_free { uintptr_t df_start; int df_len; TAILQ_ENTRY(dpcpu_free) df_link; }; DPCPU_DEFINE_STATIC(char, modspace[DPCPU_MODMIN] __aligned(__alignof(void *))); static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head); static struct sx dpcpu_lock; uintptr_t dpcpu_off[MAXCPU]; struct pcpu *cpuid_to_pcpu[MAXCPU]; struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead); /* * Initialize the MI portions of a struct pcpu. */ void pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { bzero(pcpu, size); KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue; pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue; pcpu->pc_zpcpu_offset = zpcpu_offset_cpu(cpuid); } void dpcpu_init(void *dpcpu, int cpuid) { struct pcpu *pcpu; pcpu = pcpu_find(cpuid); pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START; /* * Initialize defaults from our linker section. */ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES); /* * Place it in the global pcpu offset array. */ dpcpu_off[cpuid] = pcpu->pc_dynamic; } static void dpcpu_startup(void *dummy __unused) { struct dpcpu_free *df; df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); df->df_start = (uintptr_t)&DPCPU_NAME(modspace); df->df_len = DPCPU_MODMIN; TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link); sx_init(&dpcpu_lock, "dpcpu alloc lock"); } SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, NULL); /* * UMA_ZONE_PCPU zones for general kernel use. */ uma_zone_t pcpu_zone_4; uma_zone_t pcpu_zone_8; uma_zone_t pcpu_zone_16; uma_zone_t pcpu_zone_32; uma_zone_t pcpu_zone_64; static void pcpu_zones_startup(void) { pcpu_zone_4 = uma_zcreate("pcpu-4", 4, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_8 = uma_zcreate("pcpu-8", 8, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_16 = uma_zcreate("pcpu-16", 16, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_32 = uma_zcreate("pcpu-32", 32, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_64 = uma_zcreate("pcpu-64", 64, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_COUNTER, SI_ORDER_FIRST, pcpu_zones_startup, NULL); /* * First-fit extent based allocator for allocating space in the per-cpu * region reserved for modules. This is only intended for use by the * kernel linkers to place module linker sets. */ void * dpcpu_alloc(int size) { struct dpcpu_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_len < size) continue; if (df->df_len == size) { s = (void *)df->df_start; TAILQ_REMOVE(&dpcpu_head, df, df_link); free(df, M_PCPU); break; } s = (void *)df->df_start; df->df_len -= size; df->df_start = df->df_start + size; break; } sx_xunlock(&dpcpu_lock); return (s); } /* * Free dynamic per-cpu space at module unload time. 
*/ void dpcpu_free(void *s, int size) { struct dpcpu_free *df; struct dpcpu_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)s; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_start > end) break; /* * If we expand at the end of an entry we may have to * merge it with the one following it as well. */ if (df->df_start + df->df_len == start) { df->df_len += size; dn = TAILQ_NEXT(df, df_link); if (df->df_start + df->df_len == dn->df_start) { df->df_len += dn->df_len; TAILQ_REMOVE(&dpcpu_head, dn, df_link); free(dn, M_PCPU); } sx_xunlock(&dpcpu_lock); return; } if (df->df_start == end) { df->df_start = start; df->df_len += size; sx_xunlock(&dpcpu_lock); return; } } dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); dn->df_start = start; dn->df_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, df_link); else TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link); sx_xunlock(&dpcpu_lock); } /* * Initialize the per-cpu storage from an updated linker-set region. */ void dpcpu_copy(void *s, int size) { #ifdef SMP uintptr_t dpcpu; int i; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; memcpy((void *)(dpcpu + (uintptr_t)s), s, size); } #else memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size); #endif } /* * Destroy a struct pcpu. */ void pcpu_destroy(struct pcpu *pcpu) { STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; dpcpu_off[pcpu->pc_cpuid] = 0; } /* * Locate a struct pcpu by cpu id. */ struct pcpu * pcpu_find(u_int cpuid) { return (cpuid_to_pcpu[cpuid]); } int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int64_t count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int64_t *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; long count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(long *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } #ifdef DDB -DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off) +DB_SHOW_COMMAND_FLAGS(dpcpu_off, db_show_dpcpu_off, DB_CMD_MEMSAFE) { int id; CPU_FOREACH(id) { db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n", id, (uintmax_t)dpcpu_off[id], (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START)); } } static void show_pcpu(struct pcpu *pc) { struct thread *td; db_printf("cpuid = %d\n", pc->pc_cpuid); db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic); db_printf("curthread = "); td = pc->pc_curthread; if (td != NULL) db_printf("%p: pid %d tid %d critnest %d \"%s\"\n", td, td->td_proc->p_pid, td->td_tid, td->td_critnest, td->td_name); else db_printf("none\n"); db_printf("curpcb = %p\n", pc->pc_curpcb); db_printf("fpcurthread = "); td = pc->pc_fpcurthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("idlethread = "); td = pc->pc_idlethread; if (td != NULL) db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); else 
db_printf("none\n"); db_show_mdpcpu(pc); #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); #endif #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks, db_printf); #endif } -DB_SHOW_COMMAND(pcpu, db_show_pcpu) +DB_SHOW_COMMAND_FLAGS(pcpu, db_show_pcpu, DB_CMD_MEMSAFE) { struct pcpu *pc; int id; if (have_addr) id = ((addr >> 4) % 16) * 10 + (addr % 16); else id = PCPU_GET(cpuid); pc = pcpu_find(id); if (pc == NULL) { db_printf("CPU %d not found\n", id); return; } show_pcpu(pc); } DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all) { struct pcpu *pc; int id; db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid)); CPU_FOREACH(id) { pc = pcpu_find(id); if (pc != NULL) { show_pcpu(pc); db_printf("\n"); } } } -DB_SHOW_ALIAS(allpcpu, db_show_cpu_all); +DB_SHOW_ALIAS_FLAGS(allpcpu, db_show_cpu_all, DB_CMD_MEMSAFE); #endif diff --git a/sys/kern/subr_physmem.c b/sys/kern/subr_physmem.c index 2c7837019ea2..5caff7da4f50 100644 --- a/sys/kern/subr_physmem.c +++ b/sys/kern/subr_physmem.c @@ -1,524 +1,524 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include "opt_acpi.h" #include "opt_ddb.h" #endif /* * Routines for describing and initializing anything related to physical memory. */ #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #else #include #include #include #endif /* * These structures are used internally to keep track of regions of physical * ram, and regions within the physical ram that need to be excluded. An * exclusion region can be excluded from crash dumps, from the vm pool of pages * that can be allocated, or both, depending on the exclusion flags associated * with the region. 
*/ #ifdef DEV_ACPI #define MAX_HWCNT 32 /* ACPI needs more regions */ #define MAX_EXCNT 32 #else #define MAX_HWCNT 16 #define MAX_EXCNT 16 #endif #if defined(__arm__) #define MAX_PHYS_ADDR 0xFFFFFFFFull #elif defined(__aarch64__) || defined(__amd64__) || defined(__riscv) #define MAX_PHYS_ADDR 0xFFFFFFFFFFFFFFFFull #endif struct region { vm_paddr_t addr; vm_size_t size; uint32_t flags; }; static struct region hwregions[MAX_HWCNT]; static struct region exregions[MAX_EXCNT]; static size_t hwcnt; static size_t excnt; /* * realmem is the total number of hardware pages, excluded or not. * Maxmem is one greater than the last physical page number. */ long realmem; long Maxmem; #ifndef _KERNEL static void panic(const char *fmt, ...) { va_list va; va_start(va, fmt); vfprintf(stderr, fmt, va); fprintf(stderr, "\n"); va_end(va); __builtin_trap(); } #endif /* * Print the contents of the physical and excluded region tables using the * provided printf-like output function (which will be either printf or * db_printf). */ static void physmem_dump_tables(int (*prfunc)(const char *, ...)) { size_t i; int flags; uintmax_t addr, size; const unsigned int mbyte = 1024 * 1024; prfunc("Physical memory chunk(s):\n"); for (i = 0; i < hwcnt; ++i) { addr = hwregions[i].addr; size = hwregions[i].size; prfunc(" 0x%08jx - 0x%08jx, %5ju MB (%7ju pages)\n", addr, addr + size - 1, size / mbyte, size / PAGE_SIZE); } prfunc("Excluded memory regions:\n"); for (i = 0; i < excnt; ++i) { addr = exregions[i].addr; size = exregions[i].size; flags = exregions[i].flags; prfunc(" 0x%08jx - 0x%08jx, %5ju MB (%7ju pages) %s %s\n", addr, addr + size - 1, size / mbyte, size / PAGE_SIZE, (flags & EXFLAG_NOALLOC) ? "NoAlloc" : "", (flags & EXFLAG_NODUMP) ? "NoDump" : ""); } #ifdef DEBUG prfunc("Avail lists:\n"); for (i = 0; phys_avail[i] != 0; ++i) { prfunc(" phys_avail[%d] 0x%08x\n", i, phys_avail[i]); } for (i = 0; dump_avail[i] != 0; ++i) { prfunc(" dump_avail[%d] 0x%08x\n", i, dump_avail[i]); } #endif } /* * Print the contents of the static mapping table. Used for bootverbose. */ void physmem_print_tables(void) { physmem_dump_tables(printf); } /* * Walk the list of hardware regions, processing it against the list of * exclusions that contain the given exflags, and generating an "avail list". * * If maxphyssz is not zero it sets upper limit, in bytes, for the total * "avail list" size. Walk stops once the limit is reached and the last region * is cut short if necessary. * * Updates the value at *pavail with the sum of all pages in all hw regions. * * Returns the number of pages of non-excluded memory added to the avail list. */ static size_t regions_to_avail(vm_paddr_t *avail, uint32_t exflags, size_t maxavail, uint64_t maxphyssz, long *pavail, long *prealmem) { size_t acnt, exi, hwi; uint64_t adj, end, start, xend, xstart; long availmem, totalmem; const struct region *exp, *hwp; uint64_t availsz; totalmem = 0; availmem = 0; availsz = 0; acnt = 0; for (hwi = 0, hwp = hwregions; hwi < hwcnt; ++hwi, ++hwp) { adj = round_page(hwp->addr) - hwp->addr; start = round_page(hwp->addr); end = trunc_page(hwp->size + adj) + start; totalmem += atop((vm_offset_t)(end - start)); for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) { /* * If the excluded region does not match given flags, * continue checking with the next excluded region. */ if ((exp->flags & exflags) == 0) continue; xstart = exp->addr; xend = exp->size + xstart; /* * If the excluded region ends before this hw region, * continue checking with the next excluded region. 
*/ if (xend <= start) continue; /* * If the excluded region begins after this hw region * we're done because both lists are sorted. */ if (xstart >= end) break; /* * If the excluded region completely covers this hw * region, shrink this hw region to zero size. */ if ((start >= xstart) && (end <= xend)) { start = xend; end = xend; break; } /* * If the excluded region falls wholly within this hw * region without abutting or overlapping the beginning * or end, create an available entry from the leading * fragment, then adjust the start of this hw region to * the end of the excluded region, and continue checking * the next excluded region because another exclusion * could affect the remainder of this hw region. */ if ((xstart > start) && (xend < end)) { if ((maxphyssz != 0) && (availsz + xstart - start > maxphyssz)) { xstart = maxphyssz + start - availsz; } if (xstart <= start) continue; if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) { avail[acnt - 1] = (vm_paddr_t)xstart; } else { avail[acnt++] = (vm_paddr_t)start; avail[acnt++] = (vm_paddr_t)xstart; } availsz += (xstart - start); availmem += atop((vm_offset_t)(xstart - start)); start = xend; continue; } /* * We know the excluded region overlaps either the start * or end of this hardware region (but not both), trim * the excluded portion off the appropriate end. */ if (xstart <= start) start = xend; else end = xstart; } /* * If the trimming actions above left a non-zero size, create an * available entry for it. */ if (end > start) { if ((maxphyssz != 0) && (availsz + end - start > maxphyssz)) { end = maxphyssz + start - availsz; } if (end <= start) break; if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) { avail[acnt - 1] = (vm_paddr_t)end; } else { avail[acnt++] = (vm_paddr_t)start; avail[acnt++] = (vm_paddr_t)end; } availsz += end - start; availmem += atop((vm_offset_t)(end - start)); } if (acnt >= maxavail) panic("Not enough space in the dump/phys_avail arrays"); } if (pavail != NULL) *pavail = availmem; if (prealmem != NULL) *prealmem = totalmem; return (acnt); } /* * Check if the region at idx can be merged with the region above it. */ static size_t merge_upper_regions(struct region *regions, size_t rcnt, size_t idx) { struct region *lower, *upper; vm_paddr_t lend, uend; size_t i, mergecnt, movecnt; lower = ®ions[idx]; lend = lower->addr + lower->size; /* * Continue merging in upper entries as long as we have entries to * merge; the new block could have spanned more than one, although one * is likely the common case. */ for (i = idx + 1; i < rcnt; i++) { upper = ®ions[i]; if (lend < upper->addr || lower->flags != upper->flags) break; uend = upper->addr + upper->size; if (uend > lend) { lower->size += uend - lend; lend = lower->addr + lower->size; } if (uend >= lend) { /* * If we didn't move past the end of the upper region, * then we don't need to bother checking for another * merge because it would have been done already. Just * increment i once more to maintain the invariant that * i is one past the last entry merged. */ i++; break; } } /* * We merged in the entries from [idx + 1, i); physically move the tail * end at [i, rcnt) if we need to. */ mergecnt = i - (idx + 1); if (mergecnt > 0) { movecnt = rcnt - i; if (movecnt == 0) { /* Merged all the way to the end, just decrease rcnt. */ rcnt = idx + 1; } else { memmove(®ions[idx + 1], ®ions[idx + mergecnt + 1], movecnt * sizeof(*regions)); rcnt -= mergecnt; } } return (rcnt); } /* * Insertion-sort a new entry into a regions list; sorted by start address. 
*/ static size_t insert_region(struct region *regions, size_t rcnt, vm_paddr_t addr, vm_size_t size, uint32_t flags) { size_t i; vm_paddr_t nend, rend; struct region *ep, *rp; nend = addr + size; ep = regions + rcnt; for (i = 0, rp = regions; i < rcnt; ++i, ++rp) { if (flags == rp->flags) { rend = rp->addr + rp->size; if (addr <= rp->addr && nend >= rp->addr) { /* * New mapping overlaps at the beginning, shift * for any difference in the beginning then * shift if the new mapping extends past. */ rp->size += rp->addr - addr; rp->addr = addr; if (nend > rend) { rp->size += nend - rend; rcnt = merge_upper_regions(regions, rcnt, i); } return (rcnt); } else if (addr <= rend && nend > rp->addr) { /* * New mapping is either entirely contained * within or it's overlapping at the end. */ if (nend > rend) { rp->size += nend - rend; rcnt = merge_upper_regions(regions, rcnt, i); } return (rcnt); } } if (addr < rp->addr) { bcopy(rp, rp + 1, (ep - rp) * sizeof(*rp)); break; } } rp->addr = addr; rp->size = size; rp->flags = flags; rcnt++; return (rcnt); } /* * Add a hardware memory region. */ void physmem_hardware_region(uint64_t pa, uint64_t sz) { /* * Filter out the page at PA 0x00000000. The VM can't handle it, as * pmap_extract() == 0 means failure. */ if (pa == 0) { if (sz <= PAGE_SIZE) return; pa = PAGE_SIZE; sz -= PAGE_SIZE; } else if (pa > MAX_PHYS_ADDR) { /* This range is past usable memory, ignore it */ return; } /* * Also filter out the page at the end of the physical address space -- * if addr is non-zero and addr+size is zero we wrapped to the next byte * beyond what vm_paddr_t can express. That leads to a NULL pointer * deref early in startup; work around it by leaving the last page out. * * XXX This just in: subtract out a whole megabyte, not just 1 page. * Reducing the size by anything less than 1MB results in the NULL * pointer deref in _vm_map_lock_read(). Better to give up a megabyte * than leave some folks with an unusable system while we investigate. */ if ((pa + sz) > (MAX_PHYS_ADDR - 1024 * 1024)) { sz = MAX_PHYS_ADDR - pa + 1; if (sz <= 1024 * 1024) return; sz -= 1024 * 1024; } if (sz > 0 && hwcnt < nitems(hwregions)) hwcnt = insert_region(hwregions, hwcnt, pa, sz, 0); } /* * Add an exclusion region. */ void physmem_exclude_region(vm_paddr_t pa, vm_size_t sz, uint32_t exflags) { vm_offset_t adj; /* * Truncate the starting address down to a page boundary, and round the * ending page up to a page boundary. */ adj = pa - trunc_page(pa); pa = trunc_page(pa); sz = round_page(sz + adj); if (excnt >= nitems(exregions)) panic("failed to exclude region %#jx-%#jx", (uintmax_t)pa, (uintmax_t)(pa + sz)); excnt = insert_region(exregions, excnt, pa, sz, exflags); } size_t physmem_avail(vm_paddr_t *avail, size_t maxavail) { return (regions_to_avail(avail, EXFLAG_NOALLOC, maxavail, 0, NULL, NULL)); } #ifdef _KERNEL /* * Process all the regions added earlier into the global avail lists. * * Updates the kernel global 'physmem' with the number of physical pages * available for use (all pages not in any exclusion region). * * Updates the kernel global 'Maxmem' with the page number one greater then the * last page of physical memory in the system. 
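 *
 * A minimal early-boot usage sketch (the names dtb_mem_start, dtb_mem_size,
 * kernstart and kernlen are placeholders for whatever the platform code
 * actually discovers):
 *
 *	physmem_hardware_region(dtb_mem_start, dtb_mem_size);
 *	physmem_exclude_region(kernstart, kernlen, EXFLAG_NOALLOC);
 *	physmem_init_kernel_globals();
 *
 * Only the relative order matters: all regions and exclusions must be
 * registered before this function builds phys_avail[] and dump_avail[].
 *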
*/ void physmem_init_kernel_globals(void) { size_t nextidx; u_long hwphyssz; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", &hwphyssz); regions_to_avail(dump_avail, EXFLAG_NODUMP, PHYS_AVAIL_ENTRIES, hwphyssz, NULL, NULL); nextidx = regions_to_avail(phys_avail, EXFLAG_NOALLOC, PHYS_AVAIL_ENTRIES, hwphyssz, &physmem, &realmem); if (nextidx == 0) panic("No memory entries in phys_avail"); Maxmem = atop(phys_avail[nextidx - 1]); } #endif #ifdef DDB #include -DB_SHOW_COMMAND(physmem, db_show_physmem) +DB_SHOW_COMMAND_FLAGS(physmem, db_show_physmem, DB_CMD_MEMSAFE) { physmem_dump_tables(db_printf); } #endif /* DDB */ diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 62a0bd0a3699..0506c21b5ad8 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -1,1334 +1,1334 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include "opt_ddb.h" #include "opt_printf.h" #endif /* _KERNEL */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #endif #include #include #ifdef DDB #include #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #ifdef _KERNEL #include #else #include #endif /* * This is needed for sbuf_putbuf() when compiled into userland. Due to the * shared nature of this file, it's the only place to put it. 
*/ #ifndef _KERNEL #include #endif #ifdef _KERNEL #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ #define MAXNBUF (sizeof(intmax_t) * NBBY + 1) struct putchar_arg { int flags; int pri; struct tty *tty; char *p_bufr; size_t n_bufr; char *p_next; size_t remain; }; struct snprintf_arg { char *str; size_t remain; }; extern int log_open; static void msglogchar(int c, int pri); static void msglogstr(char *str, int pri, int filter_cr); static void prf_putbuf(char *bufr, int flags, int pri); static void putchar(int ch, void *arg); static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper); static void snprintf_func(int ch, void *arg); static bool msgbufmapped; /* Set when safe to use msgbuf */ int msgbuftrigger; struct msgbuf *msgbufp; #ifndef BOOT_TAG_SZ #define BOOT_TAG_SZ 32 #endif #ifndef BOOT_TAG /* Tag used to mark the start of a boot in dmesg */ #define BOOT_TAG "---<>---" #endif static char current_boot_tag[BOOT_TAG_SZ + 1] = BOOT_TAG; SYSCTL_STRING(_kern, OID_AUTO, boot_tag, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, current_boot_tag, 0, "Tag added to dmesg at start of boot"); static int log_console_output = 1; SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN, &log_console_output, 0, "Duplicate console output to the syslog"); /* * See the comment in log_console() below for more explanation of this. */ static int log_console_add_linefeed; SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN, &log_console_add_linefeed, 0, "log_console() adds extra newlines"); static int always_console_output; SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN, &always_console_output, 0, "Always output to console despite TIOCCONS"); /* * Warn that a system table is full. */ void tablefull(const char *tab) { log(LOG_ERR, "%s: table is full\n", tab); } /* * Uprintf prints to the controlling terminal for the current process. */ int uprintf(const char *fmt, ...) { va_list ap; struct putchar_arg pca; struct proc *p; struct thread *td; int retval; td = curthread; if (TD_IS_IDLETHREAD(td)) return (0); if (td->td_proc == initproc) { /* Produce output when we fail to load /sbin/init: */ va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } sx_slock(&proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (0); } SESS_LOCK(p->p_session); pca.tty = p->p_session->s_ttyp; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { sx_sunlock(&proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); sx_sunlock(&proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); return (retval); } /* * tprintf and vtprintf print on the controlling terminal associated with the * given session, possibly to the log as well. */ void tprintf(struct proc *p, int pri, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); vtprintf(p, pri, fmt, ap); va_end(ap); } void vtprintf(struct proc *p, int pri, const char *fmt, va_list ap) { struct tty *tp = NULL; int flags = 0; struct putchar_arg pca; struct session *sess = NULL; sx_slock(&proctree_lock); if (pri != -1) flags |= TOLOG; if (p != NULL) { PROC_LOCK(p); if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { sess = p->p_session; sess_hold(sess); PROC_UNLOCK(p); tp = sess->s_ttyp; if (tp != NULL && tty_checkoutq(tp)) flags |= TOTTY; else tp = NULL; } else PROC_UNLOCK(p); } pca.pri = pri; pca.tty = tp; pca.flags = flags; pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); sx_sunlock(&proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); if (sess != NULL) sess_release(sess); msgbuftrigger = 1; } static int _vprintf(int level, int flags, const char *fmt, va_list ap) { struct putchar_arg pca; int retval; #ifdef PRINTF_BUFR_SIZE char bufr[PRINTF_BUFR_SIZE]; #endif TSENTER(); pca.tty = NULL; pca.pri = level; pca.flags = flags; #ifdef PRINTF_BUFR_SIZE pca.p_bufr = bufr; pca.p_next = pca.p_bufr; pca.n_bufr = sizeof(bufr); pca.remain = sizeof(bufr); *pca.p_next = '\0'; #else /* Don't buffer console output. */ pca.p_bufr = NULL; #endif retval = kvprintf(fmt, putchar, &pca, 10, ap); #ifdef PRINTF_BUFR_SIZE /* Write any buffered console/log output: */ if (*pca.p_bufr != '\0') prf_putbuf(pca.p_bufr, flags, level); #endif TSEXIT(); return (retval); } /* * Log writes to the log buffer, and guarantees not to sleep (so can be * called by interrupt routines). If there is no process reading the * log yet, it writes to the console also. */ void log(int level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vlog(level, fmt, ap); va_end(ap); } void vlog(int level, const char *fmt, va_list ap) { (void)_vprintf(level, log_open ? TOLOG : TOCONS | TOLOG, fmt, ap); msgbuftrigger = 1; } #define CONSCHUNK 128 void log_console(struct uio *uio) { int c, error, nl; char *consbuffer; int pri; if (!log_console_output) return; pri = LOG_INFO | LOG_CONSOLE; uio = cloneuio(uio); consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK); nl = 0; while (uio->uio_resid > 0) { c = imin(uio->uio_resid, CONSCHUNK - 1); error = uiomove(consbuffer, c, uio); if (error != 0) break; /* Make sure we're NUL-terminated */ consbuffer[c] = '\0'; if (consbuffer[c - 1] == '\n') nl = 1; else nl = 0; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } /* * The previous behavior in log_console() is preserved when * log_console_add_linefeed is non-zero. For that behavior, if an * individual console write came in that was not terminated with a * line feed, it would add a line feed. * * This results in different data in the message buffer than * appears on the system console (which doesn't add extra line feed * characters). * * A number of programs and rc scripts write a line feed, or a period * and a line feed when they have completed their operation. On * the console, this looks seamless, but when displayed with * 'dmesg -a', you wind up with output that looks like this: * * Updating motd: * . * * On the console, it looks like this: * Updating motd:. * * We could add logic to detect that situation, or just not insert * the extra newlines. Set the kern.log_console_add_linefeed * sysctl/tunable variable to get the old behavior. 
*/ if (!nl && log_console_add_linefeed) { consbuffer[0] = '\n'; consbuffer[1] = '\0'; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } msgbuftrigger = 1; free(uio, M_IOV); free(consbuffer, M_TEMP); } int printf(const char *fmt, ...) { va_list ap; int retval; va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } int vprintf(const char *fmt, va_list ap) { int retval; retval = _vprintf(-1, TOCONS | TOLOG, fmt, ap); if (!KERNEL_PANICKED()) msgbuftrigger = 1; return (retval); } static void prf_putchar(int c, int flags, int pri) { if (flags & TOLOG) msglogchar(c, pri); if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addchar(&consmsgbuf, c); if ((constty == NULL) || always_console_output) cnputc(c); } } static void prf_putbuf(char *bufr, int flags, int pri) { if (flags & TOLOG) msglogstr(bufr, pri, /*filter_cr*/1); if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addstr(&consmsgbuf, -1, bufr, /*filter_cr*/ 0); if ((constty == NULL) || always_console_output) cnputs(bufr); } } static void putbuf(int c, struct putchar_arg *ap) { /* Check if no console output buffer was provided. */ if (ap->p_bufr == NULL) { prf_putchar(c, ap->flags, ap->pri); } else { /* Buffer the character: */ *ap->p_next++ = c; ap->remain--; /* Always leave the buffer zero terminated. */ *ap->p_next = '\0'; /* Check if the buffer needs to be flushed. */ if (ap->remain == 2 || c == '\n') { prf_putbuf(ap->p_bufr, ap->flags, ap->pri); ap->p_next = ap->p_bufr; ap->remain = ap->n_bufr; *ap->p_next = '\0'; } /* * Since we fill the buffer up one character at a time, * this should not happen. We should always catch it when * ap->remain == 2 (if not sooner due to a newline), flush * the buffer and move on. One way this could happen is * if someone sets PRINTF_BUFR_SIZE to 1 or something * similarly silly. */ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd", ap->remain)); } } /* * Print a character on console or users terminal. If destination is * the console then the last bunch of characters are saved in msgbuf for * inspection later. */ static void putchar(int c, void *arg) { struct putchar_arg *ap = (struct putchar_arg*) arg; struct tty *tp = ap->tty; int flags = ap->flags; /* Don't use the tty code after a panic or while in ddb. */ if (kdb_active) { if (c != '\0') cnputc(c); return; } if ((flags & TOTTY) && tp != NULL && !KERNEL_PANICKED()) tty_putchar(tp, c); if ((flags & (TOCONS | TOLOG)) && c != '\0') putbuf(c, ap); } /* * Scaled down version of sprintf(3). */ int sprintf(char *buf, const char *cfmt, ...) { int retval; va_list ap; va_start(ap, cfmt); retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; va_end(ap); return (retval); } /* * Scaled down version of vsprintf(3). */ int vsprintf(char *buf, const char *cfmt, va_list ap) { int retval; retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; return (retval); } /* * Scaled down version of snprintf(3). */ int snprintf(char *str, size_t size, const char *format, ...) { int retval; va_list ap; va_start(ap, format); retval = vsnprintf(str, size, format, ap); va_end(ap); return(retval); } /* * Scaled down version of vsnprintf(3). */ int vsnprintf(char *str, size_t size, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, 10, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } /* * Kernel version which takes radix argument vsnprintf(3). 
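 *
 * A usage sketch; the variadic wrapper below is illustrative only and not
 * part of this file:
 *
 *	static int
 *	snrprintf_demo(char *buf, size_t len, int radix, const char *fmt, ...)
 *	{
 *		va_list ap;
 *		int n;
 *
 *		va_start(ap, fmt);
 *		n = vsnrprintf(buf, len, radix, fmt, ap);
 *		va_end(ap);
 *		return (n);
 *	}
 *
 * With radix 16, snrprintf_demo(buf, sizeof(buf), 16, "%r", 255) leaves
 * "ff" in buf, since %r formats in the caller-supplied radix.
 *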
*/ int vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, radix, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); }
static void snprintf_func(int ch, void *arg) { struct snprintf_arg *const info = arg; if (info->remain >= 2) { *info->str++ = ch; info->remain--; } }
/* * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse * order; return an optional length and a pointer to the last character * written in the buffer (i.e., the first character of the string). * The buffer pointed to by `nbuf' must have length >= MAXNBUF. */
static char * ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) { char *p, c; p = nbuf; *p = '\0'; do { c = hex2ascii(num % base); *++p = upper ? toupper(c) : c; } while (num /= base); if (lenp) *lenp = p - nbuf; return (p); }
/*
 * Scaled down version of printf(3).
 *
 * Two additional formats:
 *
 * The format %b is supported to decode error registers.
 * Its usage is:
 *
 *	printf("reg=%b\n", regval, "<base><arg>*");
 *
 * where <base> is the output base expressed as a control character, e.g.
 * \10 gives octal; \20 gives hex.  Each <arg> is a sequence of characters,
 * the first of which gives the bit number to be inspected (origin 1), and
 * the next characters (up to a control character, i.e. a character <= 32),
 * give the name of the register.  Thus:
 *
 *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE");
 *
 * would produce output:
 *
 *	reg=3<BITTWO,BITONE>
 *
 * XXX:  %D  -- Hexdump, takes pointer and separator string:
 *		("%6D", ptr, ":")   -> XX:XX:XX:XX:XX:XX
 *		("%*D", len, ptr, " ") -> XX XX XX XX ...
 */
int kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) {
#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
char nbuf[MAXNBUF]; char *d; const char *p, *percent, *q; u_char *up; int ch, n; uintmax_t num; int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; int cflag, hflag, jflag, tflag, zflag; int bconv, dwidth, upper; char padc; int stop = 0, retval = 0; num = 0; q = NULL; if (!func) d = (char *) arg; else d = NULL; if (fmt == NULL) fmt = "(fmt null)\n"; if (radix < 2 || radix > 36) radix = 10;
for (;;) { padc = ' '; width = 0; while ((ch = (u_char)*fmt++) != '%' || stop) { if (ch == '\0') return (retval); PCHAR(ch); } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0; cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
reswitch: switch (ch = (u_char)*fmt++) { case '.': dot = 1; goto reswitch; case '#': sharpflag = 1; goto reswitch; case '+': sign = 1; goto reswitch; case '-': ladjust = 1; goto reswitch; case '%': PCHAR(ch); break; case '*': if (!dot) { width = va_arg(ap, int); if (width < 0) { ladjust = !ladjust; width = -width; } } else { dwidth = va_arg(ap, int); } goto reswitch; case '0': if (!dot) { padc = '0'; goto reswitch; } /* FALLTHROUGH */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': for (n = 0;; ++fmt) { n = n * 10 + ch - '0'; ch = *fmt; if (ch < '0' || ch > '9') break; } if (dot) dwidth = n; else width = n; goto reswitch; case 'b': ladjust = 1; bconv = 1; goto handle_nosign; case 'c': width -= 1; if (!ladjust && width > 0) while (width--) PCHAR(padc); PCHAR(va_arg(ap, int)); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 'D': up = va_arg(ap, u_char *); p = va_arg(ap, char *); if
(!width) width = 16; while(width--) { PCHAR(hex2ascii(*up >> 4)); PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) for (q=p;*q;q++) PCHAR(*q); } break; case 'd': case 'i': base = 10; sign = 1; goto handle_sign; case 'h': if (hflag) { hflag = 0; cflag = 1; } else hflag = 1; goto reswitch; case 'j': jflag = 1; goto reswitch; case 'l': if (lflag) { lflag = 0; qflag = 1; } else lflag = 1; goto reswitch; case 'n': /* * We do not support %n in kernel, but consume the * argument. */ if (jflag) (void)va_arg(ap, intmax_t *); else if (qflag) (void)va_arg(ap, quad_t *); else if (lflag) (void)va_arg(ap, long *); else if (zflag) (void)va_arg(ap, size_t *); else if (hflag) (void)va_arg(ap, short *); else if (cflag) (void)va_arg(ap, char *); else (void)va_arg(ap, int *); break; case 'o': base = 8; goto handle_nosign; case 'p': base = 16; sharpflag = (width == 0); sign = 0; num = (uintptr_t)va_arg(ap, void *); goto number; case 'q': qflag = 1; goto reswitch; case 'r': base = radix; if (sign) goto handle_sign; goto handle_nosign; case 's': p = va_arg(ap, char *); if (p == NULL) p = "(null)"; if (!dot) n = strlen (p); else for (n = 0; n < dwidth && p[n]; n++) continue; width -= n; if (!ladjust && width > 0) while (width--) PCHAR(padc); while (n--) PCHAR(*p++); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 't': tflag = 1; goto reswitch; case 'u': base = 10; goto handle_nosign; case 'X': upper = 1; /* FALLTHROUGH */ case 'x': base = 16; goto handle_nosign; case 'y': base = 16; sign = 1; goto handle_sign; case 'z': zflag = 1; goto reswitch; handle_nosign: sign = 0; if (jflag) num = va_arg(ap, uintmax_t); else if (qflag) num = va_arg(ap, u_quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, u_long); else if (zflag) num = va_arg(ap, size_t); else if (hflag) num = (u_short)va_arg(ap, int); else if (cflag) num = (u_char)va_arg(ap, int); else num = va_arg(ap, u_int); if (bconv) { q = va_arg(ap, char *); base = *q++; } goto number; handle_sign: if (jflag) num = va_arg(ap, intmax_t); else if (qflag) num = va_arg(ap, quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, long); else if (zflag) num = va_arg(ap, ssize_t); else if (hflag) num = (short)va_arg(ap, int); else if (cflag) num = (char)va_arg(ap, int); else num = va_arg(ap, int); number: if (sign && (intmax_t)num < 0) { neg = 1; num = -(intmax_t)num; } p = ksprintn(nbuf, num, base, &n, upper); tmp = 0; if (sharpflag && num != 0) { if (base == 8) tmp++; else if (base == 16) tmp += 2; } if (neg) tmp++; if (!ladjust && padc == '0') dwidth = width - tmp; width -= tmp + imax(dwidth, n); dwidth -= n; if (!ladjust) while (width-- > 0) PCHAR(' '); if (neg) PCHAR('-'); if (sharpflag && num != 0) { if (base == 8) { PCHAR('0'); } else if (base == 16) { PCHAR('0'); PCHAR('x'); } } while (dwidth-- > 0) PCHAR('0'); while (*p) PCHAR(*p--); if (bconv && num != 0) { /* %b conversion flag format. */ tmp = retval; while (*q) { n = *q++; if (num & (1 << (n - 1))) { PCHAR(retval != tmp ? ',' : '<'); for (; (n = *q) > ' '; ++q) PCHAR(n); } else for (; *q > ' '; ++q) continue; } if (retval != tmp) { PCHAR('>'); width -= retval - tmp; } } if (ladjust) while (width-- > 0) PCHAR(' '); break; default: while (percent < fmt) PCHAR(*percent++); /* * Since we ignore a formatting argument it is no * longer safe to obey the remaining formatting * arguments as the arguments will no longer match * the format specs. 
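 *
 * For example (hypothetical format string), something like
 *
 *	printf("%v and %d\n", 1, 2);
 *
 * echoes the literal text "%v and %d" followed by the newline instead of
 * consuming the arguments against a format it can no longer trust.
 *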
*/ stop = 1; break; } } #undef PCHAR } /* * Put character in log buffer with a particular priority. */ static void msglogchar(int c, int pri) { static int lastpri = -1; static int dangling; char nbuf[MAXNBUF]; char *p; if (!msgbufmapped) return; if (c == '\0' || c == '\r') return; if (pri != -1 && pri != lastpri) { if (dangling) { msgbuf_addchar(msgbufp, '\n'); dangling = 0; } msgbuf_addchar(msgbufp, '<'); for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;) msgbuf_addchar(msgbufp, *p--); msgbuf_addchar(msgbufp, '>'); lastpri = pri; } msgbuf_addchar(msgbufp, c); if (c == '\n') { dangling = 0; lastpri = -1; } else { dangling = 1; } } static void msglogstr(char *str, int pri, int filter_cr) { if (!msgbufmapped) return; msgbuf_addstr(msgbufp, pri, str, filter_cr); } void msgbufinit(void *ptr, int size) { char *cp; static struct msgbuf *oldp = NULL; bool print_boot_tag; size -= sizeof(*msgbufp); cp = (char *)ptr; print_boot_tag = !msgbufmapped; /* Attempt to fetch kern.boot_tag tunable on first mapping */ if (!msgbufmapped) TUNABLE_STR_FETCH("kern.boot_tag", current_boot_tag, sizeof(current_boot_tag)); msgbufp = (struct msgbuf *)(cp + size); msgbuf_reinit(msgbufp, cp, size); if (msgbufmapped && oldp != msgbufp) msgbuf_copy(oldp, msgbufp); msgbufmapped = true; if (print_boot_tag && *current_boot_tag != '\0') printf("%s\n", current_boot_tag); oldp = msgbufp; } /* Sysctls for accessing/clearing the msgbuf */ static int sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) { char buf[128], *bp; u_int seq; int error, len; bool wrap; error = priv_check(req->td, PRIV_MSGBUF); if (error) return (error); /* Read the whole buffer, one chunk at a time. */ mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); wrap = (seq != 0); for (;;) { len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (len == 0) return (SYSCTL_OUT(req, "", 1)); /* add nulterm */ if (wrap) { /* Skip the first line, as it is probably incomplete. 
*/ bp = memchr(buf, '\n', len); if (bp == NULL) { mtx_lock(&msgbuf_lock); continue; } wrap = false; bp++; len -= bp - buf; if (len == 0) { mtx_lock(&msgbuf_lock); continue; } } else bp = buf; error = sysctl_handle_opaque(oidp, bp, len, req); if (error) return (error); mtx_lock(&msgbuf_lock); } } SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); static int msgbuf_clearflag; static int sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { mtx_lock(&msgbuf_lock); msgbuf_clear(msgbufp); mtx_unlock(&msgbuf_lock); msgbuf_clearflag = 0; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); #ifdef DDB -DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) +DB_SHOW_COMMAND_FLAGS(msgbuf, db_show_msgbuf, DB_CMD_MEMSAFE) { int i, j; if (!msgbufmapped) { db_printf("msgbuf not mapped yet\n"); return; } db_printf("msgbufp = %p\n", msgbufp); db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n", msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq, msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum); for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) { j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq); db_printf("%c", msgbufp->msg_ptr[j]); } db_printf("\n"); } #endif /* DDB */ void hexdump(const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) printf("%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) printf("%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) printf("%c%02x", delim, cp[k]); else printf(" "); } } if ((flags & HD_OMIT_CHARS) == 0) { printf(" |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) printf(" "); else if (cp[k] >= ' ' && cp[k] <= '~') printf("%c", cp[k]); else printf("."); } printf("|"); } printf("\n"); } } #endif /* _KERNEL */ void sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) sbuf_printf(sb, "%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) sbuf_printf(sb, "%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) sbuf_printf(sb, "%c%02x", delim, cp[k]); else sbuf_printf(sb, " "); } } if ((flags & HD_OMIT_CHARS) == 0) { sbuf_printf(sb, " |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) sbuf_printf(sb, " "); else if (cp[k] >= ' ' && cp[k] <= '~') sbuf_printf(sb, "%c", cp[k]); else sbuf_printf(sb, "."); } sbuf_printf(sb, "|"); } sbuf_printf(sb, "\n"); } } #ifdef _KERNEL void counted_warning(unsigned *counter, const char *msg) { struct thread *td; unsigned c; for (;;) { c = *counter; if (c == 0) break; if (atomic_cmpset_int(counter, c, c - 1)) { td = curthread; 
log(LOG_INFO, "pid %d (%s) %s%s\n", td->td_proc->p_pid, td->td_name, msg, c > 1 ? "" : " - not logging anymore"); break; } } } #endif #ifdef _KERNEL void sbuf_putbuf(struct sbuf *sb) { prf_putbuf(sbuf_data(sb), TOLOG | TOCONS, -1); } #else void sbuf_putbuf(struct sbuf *sb) { printf("%s", sbuf_data(sb)); } #endif int sbuf_printf_drain(void *arg, const char *data, int len) { size_t *retvalptr; int r; #ifdef _KERNEL char *dataptr; char oldchr; /* * This is allowed as an extra byte is always resvered for * terminating NUL byte. Save and restore the byte because * we might be flushing a record, and there may be valid * data after the buffer. */ oldchr = data[len]; dataptr = __DECONST(char *, data); dataptr[len] = '\0'; prf_putbuf(dataptr, TOLOG | TOCONS, -1); r = len; dataptr[len] = oldchr; #else /* !_KERNEL */ r = printf("%.*s", len, data); if (r < 0) return (-errno); #endif retvalptr = arg; if (retvalptr != NULL) *retvalptr += r; return (r); } diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c index 1bbaff8264ef..32603d261279 100644 --- a/sys/kern/subr_rman.c +++ b/sys/kern/subr_rman.c @@ -1,1138 +1,1138 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The kernel resource manager. This code is responsible for keeping track * of hardware resources which are apportioned out to various drivers. * It does not actually assign those resources, and it is not expected * that end-device drivers will call into this code directly. Rather, * the code which implements the buses that those devices are attached to, * and the code which manages CPU resources, will call this code, and the * end-device drivers will make upcalls to that code to actually perform * the allocation. * * There are two sorts of resources managed by this code. The first is * the more familiar array (RMAN_ARRAY) type; resources in this class * consist of a sequence of individually-allocatable objects which have * been numbered in some well-defined order. Most of the resources * are of this type, as it is the most familiar. 
The second type is * called a gauge (RMAN_GAUGE), and models fungible resources (i.e., * resources in which each instance is indistinguishable from every * other instance). The principal anticipated application of gauges * is in the context of power consumption, where a bus may have a specific * power budget which all attached devices share. RMAN_GAUGE is not * implemented yet. * * For array resources, we make one simplifying assumption: two clients * sharing the same resource must use the same range of indices. That * is to say, sharing of overlapping-but-not-identical regions is not * permitted. */ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* XXX debugging */ #include #include #include #ifdef DDB #include #endif /* * We use a linked list rather than a bitmap because we need to be able to * represent potentially huge objects (like all of a processor's physical * address space). */ struct resource_i { struct resource r_r; TAILQ_ENTRY(resource_i) r_link; LIST_ENTRY(resource_i) r_sharelink; LIST_HEAD(, resource_i) *r_sharehead; rman_res_t r_start; /* index of the first entry in this resource */ rman_res_t r_end; /* index of the last entry (inclusive) */ u_int r_flags; void *r_virtual; /* virtual address of this resource */ void *r_irq_cookie; /* interrupt cookie for this (interrupt) resource */ device_t r_dev; /* device which has allocated this resource */ struct rman *r_rm; /* resource manager from whence this came */ int r_rid; /* optional rid for this resource. */ }; static int rman_debug = 0; SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RWTUN, &rman_debug, 0, "rman debug"); #define DPRINTF(params) if (rman_debug) printf params static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager"); struct rman_head rman_head; static struct mtx rman_mtx; /* mutex to protect rman_head */ static int int_rman_release_resource(struct rman *rm, struct resource_i *r); static __inline struct resource_i * int_alloc_resource(int malloc_flag) { struct resource_i *r; r = malloc(sizeof *r, M_RMAN, malloc_flag | M_ZERO); if (r != NULL) { r->r_r.__r_i = r; } return (r); } int rman_init(struct rman *rm) { static int once = 0; if (once == 0) { once = 1; TAILQ_INIT(&rman_head); mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF); } if (rm->rm_start == 0 && rm->rm_end == 0) rm->rm_end = ~0; if (rm->rm_type == RMAN_UNINIT) panic("rman_init"); if (rm->rm_type == RMAN_GAUGE) panic("implement RMAN_GAUGE"); TAILQ_INIT(&rm->rm_list); rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO); if (rm->rm_mtx == NULL) return ENOMEM; mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF); mtx_lock(&rman_mtx); TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); mtx_unlock(&rman_mtx); return 0; } int rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end) { struct resource_i *r, *s, *t; int rv = 0; DPRINTF(("rman_manage_region: <%s> request: start %#jx, end %#jx\n", rm->rm_descr, start, end)); if (start < rm->rm_start || end > rm->rm_end) return EINVAL; r = int_alloc_resource(M_NOWAIT); if (r == NULL) return ENOMEM; r->r_start = start; r->r_end = end; r->r_rm = rm; mtx_lock(rm->rm_mtx); /* Skip entries before us. */ TAILQ_FOREACH(s, &rm->rm_list, r_link) { if (s->r_end == ~0) break; if (s->r_end + 1 >= r->r_start) break; } /* If we ran off the end of the list, insert at the tail. */ if (s == NULL) { TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link); } else { /* Check for any overlap with the current region. 
*/ if (r->r_start <= s->r_end && r->r_end >= s->r_start) { rv = EBUSY; goto out; } /* Check for any overlap with the next region. */ t = TAILQ_NEXT(s, r_link); if (t && r->r_start <= t->r_end && r->r_end >= t->r_start) { rv = EBUSY; goto out; } /* * See if this region can be merged with the next region. If * not, clear the pointer. */ if (t && (r->r_end + 1 != t->r_start || t->r_flags != 0)) t = NULL; /* See if we can merge with the current region. */ if (s->r_end + 1 == r->r_start && s->r_flags == 0) { /* Can we merge all 3 regions? */ if (t != NULL) { s->r_end = t->r_end; TAILQ_REMOVE(&rm->rm_list, t, r_link); free(r, M_RMAN); free(t, M_RMAN); } else { s->r_end = r->r_end; free(r, M_RMAN); } } else if (t != NULL) { /* Can we merge with just the next region? */ t->r_start = r->r_start; free(r, M_RMAN); } else if (s->r_end < r->r_start) { TAILQ_INSERT_AFTER(&rm->rm_list, s, r, r_link); } else { TAILQ_INSERT_BEFORE(s, r, r_link); } } out: mtx_unlock(rm->rm_mtx); return rv; } int rman_init_from_resource(struct rman *rm, struct resource *r) { int rv; if ((rv = rman_init(rm)) != 0) return (rv); return (rman_manage_region(rm, r->__r_i->r_start, r->__r_i->r_end)); } int rman_fini(struct rman *rm) { struct resource_i *r; mtx_lock(rm->rm_mtx); TAILQ_FOREACH(r, &rm->rm_list, r_link) { if (r->r_flags & RF_ALLOCATED) { mtx_unlock(rm->rm_mtx); return EBUSY; } } /* * There really should only be one of these if we are in this * state and the code is working properly, but it can't hurt. */ while (!TAILQ_EMPTY(&rm->rm_list)) { r = TAILQ_FIRST(&rm->rm_list); TAILQ_REMOVE(&rm->rm_list, r, r_link); free(r, M_RMAN); } mtx_unlock(rm->rm_mtx); mtx_lock(&rman_mtx); TAILQ_REMOVE(&rman_head, rm, rm_link); mtx_unlock(&rman_mtx); mtx_destroy(rm->rm_mtx); free(rm->rm_mtx, M_RMAN); return 0; } int rman_first_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end) { struct resource_i *r; mtx_lock(rm->rm_mtx); TAILQ_FOREACH(r, &rm->rm_list, r_link) { if (!(r->r_flags & RF_ALLOCATED)) { *start = r->r_start; *end = r->r_end; mtx_unlock(rm->rm_mtx); return (0); } } mtx_unlock(rm->rm_mtx); return (ENOENT); } int rman_last_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end) { struct resource_i *r; mtx_lock(rm->rm_mtx); TAILQ_FOREACH_REVERSE(r, &rm->rm_list, resource_head, r_link) { if (!(r->r_flags & RF_ALLOCATED)) { *start = r->r_start; *end = r->r_end; mtx_unlock(rm->rm_mtx); return (0); } } mtx_unlock(rm->rm_mtx); return (ENOENT); } /* Shrink or extend one or both ends of an allocated resource. */ int rman_adjust_resource(struct resource *rr, rman_res_t start, rman_res_t end) { struct resource_i *r, *s, *t, *new; struct rman *rm; /* Not supported for shared resources. */ r = rr->__r_i; if (r->r_flags & RF_SHAREABLE) return (EINVAL); /* * This does not support wholesale moving of a resource. At * least part of the desired new range must overlap with the * existing resource. */ if (end < r->r_start || r->r_end < start) return (EINVAL); /* * Find the two resource regions immediately adjacent to the * allocated resource. */ rm = r->r_rm; mtx_lock(rm->rm_mtx); #ifdef INVARIANTS TAILQ_FOREACH(s, &rm->rm_list, r_link) { if (s == r) break; } if (s == NULL) panic("resource not in list"); #endif s = TAILQ_PREV(r, resource_head, r_link); t = TAILQ_NEXT(r, r_link); KASSERT(s == NULL || s->r_end + 1 == r->r_start, ("prev resource mismatch")); KASSERT(t == NULL || r->r_end + 1 == t->r_start, ("next resource mismatch")); /* * See if the changes are permitted. 
Shrinking is always allowed, * but growing requires sufficient room in the adjacent region. */ if (start < r->r_start && (s == NULL || (s->r_flags & RF_ALLOCATED) || s->r_start > start)) { mtx_unlock(rm->rm_mtx); return (EBUSY); } if (end > r->r_end && (t == NULL || (t->r_flags & RF_ALLOCATED) || t->r_end < end)) { mtx_unlock(rm->rm_mtx); return (EBUSY); } /* * While holding the lock, grow either end of the resource as * needed and shrink either end if the shrinking does not require * allocating a new resource. We can safely drop the lock and then * insert a new range to handle the shrinking case afterwards. */ if (start < r->r_start || (start > r->r_start && s != NULL && !(s->r_flags & RF_ALLOCATED))) { KASSERT(s->r_flags == 0, ("prev is busy")); r->r_start = start; if (s->r_start == start) { TAILQ_REMOVE(&rm->rm_list, s, r_link); free(s, M_RMAN); } else s->r_end = start - 1; } if (end > r->r_end || (end < r->r_end && t != NULL && !(t->r_flags & RF_ALLOCATED))) { KASSERT(t->r_flags == 0, ("next is busy")); r->r_end = end; if (t->r_end == end) { TAILQ_REMOVE(&rm->rm_list, t, r_link); free(t, M_RMAN); } else t->r_start = end + 1; } mtx_unlock(rm->rm_mtx); /* * Handle the shrinking cases that require allocating a new * resource to hold the newly-free region. We have to recheck * if we still need this new region after acquiring the lock. */ if (start > r->r_start) { new = int_alloc_resource(M_WAITOK); new->r_start = r->r_start; new->r_end = start - 1; new->r_rm = rm; mtx_lock(rm->rm_mtx); r->r_start = start; s = TAILQ_PREV(r, resource_head, r_link); if (s != NULL && !(s->r_flags & RF_ALLOCATED)) { s->r_end = start - 1; free(new, M_RMAN); } else TAILQ_INSERT_BEFORE(r, new, r_link); mtx_unlock(rm->rm_mtx); } if (end < r->r_end) { new = int_alloc_resource(M_WAITOK); new->r_start = end + 1; new->r_end = r->r_end; new->r_rm = rm; mtx_lock(rm->rm_mtx); r->r_end = end; t = TAILQ_NEXT(r, r_link); if (t != NULL && !(t->r_flags & RF_ALLOCATED)) { t->r_start = end + 1; free(new, M_RMAN); } else TAILQ_INSERT_AFTER(&rm->rm_list, r, new, r_link); mtx_unlock(rm->rm_mtx); } return (0); } #define SHARE_TYPE(f) (f & (RF_SHAREABLE | RF_PREFETCHABLE)) struct resource * rman_reserve_resource_bound(struct rman *rm, rman_res_t start, rman_res_t end, rman_res_t count, rman_res_t bound, u_int flags, device_t dev) { u_int new_rflags; struct resource_i *r, *s, *rv; rman_res_t rstart, rend, amask, bmask; rv = NULL; DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#jx, %#jx], " "length %#jx, flags %x, device %s\n", rm->rm_descr, start, end, count, flags, dev == NULL ? 
"" : device_get_nameunit(dev))); KASSERT(count != 0, ("%s: attempted to allocate an empty range", __func__)); KASSERT((flags & RF_FIRSTSHARE) == 0, ("invalid flags %#x", flags)); new_rflags = (flags & ~RF_FIRSTSHARE) | RF_ALLOCATED; mtx_lock(rm->rm_mtx); r = TAILQ_FIRST(&rm->rm_list); if (r == NULL) { DPRINTF(("NULL list head\n")); } else { DPRINTF(("rman_reserve_resource_bound: trying %#jx <%#jx,%#jx>\n", r->r_end, start, count-1)); } for (r = TAILQ_FIRST(&rm->rm_list); r && r->r_end < start + count - 1; r = TAILQ_NEXT(r, r_link)) { ; DPRINTF(("rman_reserve_resource_bound: tried %#jx <%#jx,%#jx>\n", r->r_end, start, count-1)); } if (r == NULL) { DPRINTF(("could not find a region\n")); goto out; } amask = (1ull << RF_ALIGNMENT(flags)) - 1; KASSERT(start <= RM_MAX_END - amask, ("start (%#jx) + amask (%#jx) would wrap around", start, amask)); /* If bound is 0, bmask will also be 0 */ bmask = ~(bound - 1); /* * First try to find an acceptable totally-unshared region. */ for (s = r; s; s = TAILQ_NEXT(s, r_link)) { DPRINTF(("considering [%#jx, %#jx]\n", s->r_start, s->r_end)); /* * The resource list is sorted, so there is no point in * searching further once r_start is too large. */ if (s->r_start > end - (count - 1)) { DPRINTF(("s->r_start (%#jx) + count - 1> end (%#jx)\n", s->r_start, end)); break; } if (s->r_start > RM_MAX_END - amask) { DPRINTF(("s->r_start (%#jx) + amask (%#jx) too large\n", s->r_start, amask)); break; } if (s->r_flags & RF_ALLOCATED) { DPRINTF(("region is allocated\n")); continue; } rstart = ummax(s->r_start, start); /* * Try to find a region by adjusting to boundary and alignment * until both conditions are satisfied. This is not an optimal * algorithm, but in most cases it isn't really bad, either. */ do { rstart = (rstart + amask) & ~amask; if (((rstart ^ (rstart + count - 1)) & bmask) != 0) rstart += bound - (rstart & ~bmask); } while ((rstart & amask) != 0 && rstart < end && rstart < s->r_end); rend = ummin(s->r_end, ummax(rstart + count - 1, end)); if (rstart > rend) { DPRINTF(("adjusted start exceeds end\n")); continue; } DPRINTF(("truncated region: [%#jx, %#jx]; size %#jx (requested %#jx)\n", rstart, rend, (rend - rstart + 1), count)); if ((rend - rstart) >= (count - 1)) { DPRINTF(("candidate region: [%#jx, %#jx], size %#jx\n", rstart, rend, (rend - rstart + 1))); if ((s->r_end - s->r_start + 1) == count) { DPRINTF(("candidate region is entire chunk\n")); rv = s; rv->r_flags = new_rflags; rv->r_dev = dev; goto out; } /* * If s->r_start < rstart and * s->r_end > rstart + count - 1, then * we need to split the region into three pieces * (the middle one will get returned to the user). * Otherwise, we are allocating at either the * beginning or the end of s, so we only need to * split it in two. The first case requires * two new allocations; the second requires but one. */ rv = int_alloc_resource(M_NOWAIT); if (rv == NULL) goto out; rv->r_start = rstart; rv->r_end = rstart + count - 1; rv->r_flags = new_rflags; rv->r_dev = dev; rv->r_rm = rm; if (s->r_start < rv->r_start && s->r_end > rv->r_end) { DPRINTF(("splitting region in three parts: " "[%#jx, %#jx]; [%#jx, %#jx]; [%#jx, %#jx]\n", s->r_start, rv->r_start - 1, rv->r_start, rv->r_end, rv->r_end + 1, s->r_end)); /* * We are allocating in the middle. 
*/ r = int_alloc_resource(M_NOWAIT); if (r == NULL) { free(rv, M_RMAN); rv = NULL; goto out; } r->r_start = rv->r_end + 1; r->r_end = s->r_end; r->r_flags = s->r_flags; r->r_rm = rm; s->r_end = rv->r_start - 1; TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, r_link); TAILQ_INSERT_AFTER(&rm->rm_list, rv, r, r_link); } else if (s->r_start == rv->r_start) { DPRINTF(("allocating from the beginning\n")); /* * We are allocating at the beginning. */ s->r_start = rv->r_end + 1; TAILQ_INSERT_BEFORE(s, rv, r_link); } else { DPRINTF(("allocating at the end\n")); /* * We are allocating at the end. */ s->r_end = rv->r_start - 1; TAILQ_INSERT_AFTER(&rm->rm_list, s, rv, r_link); } goto out; } } /* * Now find an acceptable shared region, if the client's requirements * allow sharing. By our implementation restriction, a candidate * region must match exactly by both size and sharing type in order * to be considered compatible with the client's request. (The * former restriction could probably be lifted without too much * additional work, but this does not seem warranted.) */ DPRINTF(("no unshared regions found\n")); if ((flags & RF_SHAREABLE) == 0) goto out; for (s = r; s && s->r_end <= end; s = TAILQ_NEXT(s, r_link)) { if (SHARE_TYPE(s->r_flags) == SHARE_TYPE(flags) && s->r_start >= start && (s->r_end - s->r_start + 1) == count && (s->r_start & amask) == 0 && ((s->r_start ^ s->r_end) & bmask) == 0) { rv = int_alloc_resource(M_NOWAIT); if (rv == NULL) goto out; rv->r_start = s->r_start; rv->r_end = s->r_end; rv->r_flags = new_rflags; rv->r_dev = dev; rv->r_rm = rm; if (s->r_sharehead == NULL) { s->r_sharehead = malloc(sizeof *s->r_sharehead, M_RMAN, M_NOWAIT | M_ZERO); if (s->r_sharehead == NULL) { free(rv, M_RMAN); rv = NULL; goto out; } LIST_INIT(s->r_sharehead); LIST_INSERT_HEAD(s->r_sharehead, s, r_sharelink); s->r_flags |= RF_FIRSTSHARE; } rv->r_sharehead = s->r_sharehead; LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink); goto out; } } /* * We couldn't find anything. */ out: mtx_unlock(rm->rm_mtx); return (rv == NULL ? NULL : &rv->r_r); } struct resource * rman_reserve_resource(struct rman *rm, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags, device_t dev) { return (rman_reserve_resource_bound(rm, start, end, count, 0, flags, dev)); } int rman_activate_resource(struct resource *re) { struct resource_i *r; struct rman *rm; r = re->__r_i; rm = r->r_rm; mtx_lock(rm->rm_mtx); r->r_flags |= RF_ACTIVE; mtx_unlock(rm->rm_mtx); return 0; } int rman_deactivate_resource(struct resource *r) { struct rman *rm; rm = r->__r_i->r_rm; mtx_lock(rm->rm_mtx); r->__r_i->r_flags &= ~RF_ACTIVE; mtx_unlock(rm->rm_mtx); return 0; } static int int_rman_release_resource(struct rman *rm, struct resource_i *r) { struct resource_i *s, *t; if (r->r_flags & RF_ACTIVE) r->r_flags &= ~RF_ACTIVE; /* * Check for a sharing list first. If there is one, then we don't * have to think as hard. */ if (r->r_sharehead) { /* * If a sharing list exists, then we know there are at * least two sharers. * * If we are in the main circleq, appoint someone else. */ LIST_REMOVE(r, r_sharelink); s = LIST_FIRST(r->r_sharehead); if (r->r_flags & RF_FIRSTSHARE) { s->r_flags |= RF_FIRSTSHARE; TAILQ_INSERT_BEFORE(r, s, r_link); TAILQ_REMOVE(&rm->rm_list, r, r_link); } /* * Make sure that the sharing list goes away completely * if the resource is no longer being shared at all. 
*/ if (LIST_NEXT(s, r_sharelink) == NULL) { free(s->r_sharehead, M_RMAN); s->r_sharehead = NULL; s->r_flags &= ~RF_FIRSTSHARE; } goto out; } /* * Look at the adjacent resources in the list and see if our * segment can be merged with any of them. If either of the * resources is allocated or is not exactly adjacent then they * cannot be merged with our segment. */ s = TAILQ_PREV(r, resource_head, r_link); if (s != NULL && ((s->r_flags & RF_ALLOCATED) != 0 || s->r_end + 1 != r->r_start)) s = NULL; t = TAILQ_NEXT(r, r_link); if (t != NULL && ((t->r_flags & RF_ALLOCATED) != 0 || r->r_end + 1 != t->r_start)) t = NULL; if (s != NULL && t != NULL) { /* * Merge all three segments. */ s->r_end = t->r_end; TAILQ_REMOVE(&rm->rm_list, r, r_link); TAILQ_REMOVE(&rm->rm_list, t, r_link); free(t, M_RMAN); } else if (s != NULL) { /* * Merge previous segment with ours. */ s->r_end = r->r_end; TAILQ_REMOVE(&rm->rm_list, r, r_link); } else if (t != NULL) { /* * Merge next segment with ours. */ t->r_start = r->r_start; TAILQ_REMOVE(&rm->rm_list, r, r_link); } else { /* * At this point, we know there is nothing we * can potentially merge with, because on each * side, there is either nothing there or what is * there is still allocated. In that case, we don't * want to remove r from the list; we simply want to * change it to an unallocated region and return * without freeing anything. */ r->r_flags &= ~RF_ALLOCATED; r->r_dev = NULL; return 0; } out: free(r, M_RMAN); return 0; } int rman_release_resource(struct resource *re) { int rv; struct resource_i *r; struct rman *rm; r = re->__r_i; rm = r->r_rm; mtx_lock(rm->rm_mtx); rv = int_rman_release_resource(rm, r); mtx_unlock(rm->rm_mtx); return (rv); } uint32_t rman_make_alignment_flags(uint32_t size) { int i; /* * Find the hightest bit set, and add one if more than one bit * set. We're effectively computing the ceil(log2(size)) here. 
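 *
 * Two illustrative calls (values invented): an exact power of two keeps
 * its log2, anything else rounds up,
 *
 *	rman_make_alignment_flags(0x1000)	-> RF_ALIGNMENT_LOG2(12)
 *	rman_make_alignment_flags(0x1800)	-> RF_ALIGNMENT_LOG2(13)
 *
 * and a caller would typically OR the result into the flags it hands to
 * rman_reserve_resource() so that the range it gets back is aligned to at
 * least the smallest power of two not less than the requested size.
 *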
*/ for (i = 31; i > 0; i--) if ((1 << i) & size) break; if (~(1 << i) & size) i++; return(RF_ALIGNMENT_LOG2(i)); } void rman_set_start(struct resource *r, rman_res_t start) { r->__r_i->r_start = start; } rman_res_t rman_get_start(struct resource *r) { return (r->__r_i->r_start); } void rman_set_end(struct resource *r, rman_res_t end) { r->__r_i->r_end = end; } rman_res_t rman_get_end(struct resource *r) { return (r->__r_i->r_end); } rman_res_t rman_get_size(struct resource *r) { return (r->__r_i->r_end - r->__r_i->r_start + 1); } u_int rman_get_flags(struct resource *r) { return (r->__r_i->r_flags); } void rman_set_virtual(struct resource *r, void *v) { r->__r_i->r_virtual = v; } void * rman_get_virtual(struct resource *r) { return (r->__r_i->r_virtual); } void rman_set_irq_cookie(struct resource *r, void *c) { r->__r_i->r_irq_cookie = c; } void * rman_get_irq_cookie(struct resource *r) { return (r->__r_i->r_irq_cookie); } void rman_set_bustag(struct resource *r, bus_space_tag_t t) { r->r_bustag = t; } bus_space_tag_t rman_get_bustag(struct resource *r) { return (r->r_bustag); } void rman_set_bushandle(struct resource *r, bus_space_handle_t h) { r->r_bushandle = h; } bus_space_handle_t rman_get_bushandle(struct resource *r) { return (r->r_bushandle); } void rman_set_mapping(struct resource *r, struct resource_map *map) { KASSERT(rman_get_size(r) == map->r_size, ("rman_set_mapping: size mismatch")); rman_set_bustag(r, map->r_bustag); rman_set_bushandle(r, map->r_bushandle); rman_set_virtual(r, map->r_vaddr); } void rman_get_mapping(struct resource *r, struct resource_map *map) { map->r_bustag = rman_get_bustag(r); map->r_bushandle = rman_get_bushandle(r); map->r_size = rman_get_size(r); map->r_vaddr = rman_get_virtual(r); } void rman_set_rid(struct resource *r, int rid) { r->__r_i->r_rid = rid; } int rman_get_rid(struct resource *r) { return (r->__r_i->r_rid); } void rman_set_device(struct resource *r, device_t dev) { r->__r_i->r_dev = dev; } device_t rman_get_device(struct resource *r) { return (r->__r_i->r_dev); } int rman_is_region_manager(struct resource *r, struct rman *rm) { return (r->__r_i->r_rm == rm); } /* * Sysctl interface for scanning the resource lists. * * We take two input parameters; the index into the list of resource * managers, and the resource offset into the list. */ static int sysctl_rman(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; int rman_idx, res_idx; struct rman *rm; struct resource_i *res; struct resource_i *sres; struct u_rman urm; struct u_resource ures; int error; if (namelen != 3) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); rman_idx = name[1]; res_idx = name[2]; /* * Find the indexed resource manager */ mtx_lock(&rman_mtx); TAILQ_FOREACH(rm, &rman_head, rm_link) { if (rman_idx-- == 0) break; } mtx_unlock(&rman_mtx); if (rm == NULL) return (ENOENT); /* * If the resource index is -1, we want details on the * resource manager. */ if (res_idx == -1) { bzero(&urm, sizeof(urm)); urm.rm_handle = (uintptr_t)rm; if (rm->rm_descr != NULL) strlcpy(urm.rm_descr, rm->rm_descr, RM_TEXTLEN); urm.rm_start = rm->rm_start; urm.rm_size = rm->rm_end - rm->rm_start + 1; urm.rm_type = rm->rm_type; error = SYSCTL_OUT(req, &urm, sizeof(urm)); return (error); } /* * Find the indexed resource and return it. 
*/ mtx_lock(rm->rm_mtx); TAILQ_FOREACH(res, &rm->rm_list, r_link) { if (res->r_sharehead != NULL) { LIST_FOREACH(sres, res->r_sharehead, r_sharelink) if (res_idx-- == 0) { res = sres; goto found; } } else if (res_idx-- == 0) goto found; } mtx_unlock(rm->rm_mtx); return (ENOENT); found: bzero(&ures, sizeof(ures)); ures.r_handle = (uintptr_t)res; ures.r_parent = (uintptr_t)res->r_rm; ures.r_device = (uintptr_t)res->r_dev; if (res->r_dev != NULL) { if (device_get_name(res->r_dev) != NULL) { snprintf(ures.r_devname, RM_TEXTLEN, "%s%d", device_get_name(res->r_dev), device_get_unit(res->r_dev)); } else { strlcpy(ures.r_devname, "nomatch", RM_TEXTLEN); } } else { ures.r_devname[0] = '\0'; } ures.r_start = res->r_start; ures.r_size = res->r_end - res->r_start + 1; ures.r_flags = res->r_flags; mtx_unlock(rm->rm_mtx); error = SYSCTL_OUT(req, &ures, sizeof(ures)); return (error); } static SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_rman, "kernel resource manager"); #ifdef DDB static void dump_rman_header(struct rman *rm) { if (db_pager_quit) return; db_printf("rman %p: %s (0x%jx-0x%jx full range)\n", rm, rm->rm_descr, (rman_res_t)rm->rm_start, (rman_res_t)rm->rm_end); } static void dump_rman(struct rman *rm) { struct resource_i *r; const char *devname; if (db_pager_quit) return; TAILQ_FOREACH(r, &rm->rm_list, r_link) { if (r->r_dev != NULL) { devname = device_get_nameunit(r->r_dev); if (devname == NULL) devname = "nomatch"; } else devname = NULL; db_printf(" 0x%jx-0x%jx (RID=%d) ", r->r_start, r->r_end, r->r_rid); if (devname != NULL) db_printf("(%s)\n", devname); else db_printf("----\n"); if (db_pager_quit) return; } } DB_SHOW_COMMAND(rman, db_show_rman) { if (have_addr) { dump_rman_header((struct rman *)addr); dump_rman((struct rman *)addr); } } -DB_SHOW_COMMAND(rmans, db_show_rmans) +DB_SHOW_COMMAND_FLAGS(rmans, db_show_rmans, DB_CMD_MEMSAFE) { struct rman *rm; TAILQ_FOREACH(rm, &rman_head, rm_link) { dump_rman_header(rm); } } DB_SHOW_ALL_COMMAND(rman, db_show_all_rman) { struct rman *rm; TAILQ_FOREACH(rm, &rman_head, rm_link) { dump_rman_header(rm); dump_rman(rm); } } -DB_SHOW_ALIAS(allrman, db_show_all_rman); +DB_SHOW_ALIAS_FLAGS(allrman, db_show_all_rman, DB_CMD_MEMSAFE); #endif diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c index d966a796c867..d976d0994668 100644 --- a/sys/kern/subr_turnstile.c +++ b/sys/kern/subr_turnstile.c @@ -1,1329 +1,1329 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of turnstiles used to hold queue of threads blocked on * non-sleepable locks. Sleepable locks use condition variables to * implement their queues. Turnstiles differ from a sleep queue in that * turnstile queue's are assigned to a lock held by an owning thread. Thus, * when one thread is enqueued onto a turnstile, it can lend its priority * to the owning thread. * * We wish to avoid bloating locks with an embedded turnstile and we do not * want to use back-pointers in the locks for the same reason. Thus, we * use a similar approach to that of Solaris 7 as described in Solaris * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up * in a hash table based on the address of the lock. Each entry in the * hash table is a linked-lists of turnstiles and is called a turnstile * chain. Each chain contains a spin mutex that protects all of the * turnstiles in the chain. * * Each time a thread is created, a turnstile is allocated from a UMA zone * and attached to that thread. When a thread blocks on a lock, if it is the * first thread to block, it lends its turnstile to the lock. If the lock * already has a turnstile, then it gives its turnstile to the lock's * turnstile's free list. When a thread is woken up, it takes a turnstile from * the free list if there are any other waiters. If it is the only thread * blocked on the lock, then it reclaims the turnstile associated with the lock * and removes it from the hash table. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_turnstile_profiling.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif /* * Constants for the hash table of turnstile chains. TC_SHIFT is a magic * number chosen because the sleep queue's use the same value for the * shift. Basically, we ignore the lower 8 bits of the address. * TC_TABLESIZE must be a power of two for TC_MASK to work properly. */ #define TC_TABLESIZE 128 /* Must be power of 2. */ #define TC_MASK (TC_TABLESIZE - 1) #define TC_SHIFT 8 #define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK) #define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)] /* * There are three different lists of turnstiles as follows. The list * connected by ts_link entries is a per-thread list of all the turnstiles * attached to locks that we own. This is used to fixup our priority when * a lock is released. The other two lists use the ts_hash entries. The * first of these two is the turnstile chain list that a turnstile is on * when it is attached to a lock. The second list to use ts_hash is the * free list hung off of a turnstile that is attached to a lock. * * Each turnstile contains three lists of threads. 
The two ts_blocked lists * are linked list of threads blocked on the turnstile's lock. One list is * for exclusive waiters, and the other is for shared waiters. The * ts_pending list is a linked list of threads previously awakened by * turnstile_signal() or turnstile_wait() that are waiting to be put on * the run queue. * * Locking key: * c - turnstile chain lock * q - td_contested lock */ struct turnstile { struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */ struct lock_object *ts_lockobj; /* (c) Lock we reference. */ struct thread *ts_owner; /* (c + q) Who owns the lock. */ }; struct turnstile_chain { LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */ struct mtx tc_lock; /* Spin lock for this chain. */ #ifdef TURNSTILE_PROFILING u_int tc_depth; /* Length of tc_queues. */ u_int tc_max_depth; /* Max length of tc_queues. */ #endif }; #ifdef TURNSTILE_PROFILING u_int turnstile_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "turnstile profiling"); static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "turnstile chain stats"); SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD, &turnstile_max_depth, 0, "maximum depth achieved of a single chain"); #endif static struct mtx td_contested_lock; static struct turnstile_chain turnstile_chains[TC_TABLESIZE]; static uma_zone_t turnstile_zone; /* * Prototypes for non-exported routines. */ static void init_turnstile0(void *dummy); #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg); #endif static void propagate_priority(struct thread *td); static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td); static struct thread *turnstile_first_waiter(struct turnstile *ts); static void turnstile_setowner(struct turnstile *ts, struct thread *owner); #ifdef INVARIANTS static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); static void turnstile_fini(void *mem, int size); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , sleep); SDT_PROBE_DEFINE2(sched, , , wakeup, "struct thread *", "struct proc *"); static inline void propagate_unlock_ts(struct turnstile *top, struct turnstile *ts) { if (ts != top) mtx_unlock_spin(&ts->ts_lock); } static inline void propagate_unlock_td(struct turnstile *top, struct thread *td) { if (td->td_lock != &top->ts_lock) thread_unlock(td); } /* * Walks the chain of turnstiles and their owners to propagate the priority * of the thread being blocked to all the threads holding locks that have to * release their locks before this thread can run again. */ static void propagate_priority(struct thread *td) { struct turnstile *ts, *top; int pri; THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; top = ts = td->td_blocked; THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * The original turnstile lock is held across the entire * operation. We only ever lock down the chain so the lock * order is constant. */ for (;;) { td = ts->ts_owner; if (td == NULL) { /* * This might be a read lock with no owner. There's * not much we can do, so just bail. 
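 * This is the normal case for an rw lock held in shared (read) mode:
 * the lock records how many readers hold it, not which threads they
 * are, so there is no single owner to lend our priority to.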
*/ propagate_unlock_ts(top, ts); return; } /* * Wait for the thread lock to be stable and then only * acquire if it is not the turnstile lock. */ thread_lock_block_wait(td); if (td->td_lock != &ts->ts_lock) { thread_lock_flags(td, MTX_DUPOK); propagate_unlock_ts(top, ts); } MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); /* * If the thread is asleep, then we are probably about * to deadlock. To make debugging this easier, show * backtrace of misbehaving thread and panic to not * leave the kernel deadlocked. */ if (TD_IS_SLEEPING(td)) { printf( "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n", td->td_tid, td->td_proc->p_pid); kdb_backtrace_thread(td); panic("sleeping thread"); } /* * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ if (td->td_priority <= pri) { propagate_unlock_td(top, td); return; } /* * Bump this thread's priority. */ sched_lend_prio(td, pri); /* * If lock holder is actually running or on the run queue * then we are done. */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); propagate_unlock_td(top, td); return; } #ifndef SMP /* * For UP, we check to see if td is curthread (this shouldn't * ever happen however as it would mean we are in a deadlock.) */ KASSERT(td != curthread, ("Deadlock detected")); #endif /* * If we aren't blocked on a lock, we should be. */ KASSERT(TD_ON_LOCK(td), ( "thread %d(%s):%d holds %s but isn't blocked on a lock\n", td->td_tid, td->td_name, TD_GET_STATE(td), ts->ts_lockobj->lo_name)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { propagate_unlock_ts(top, ts); return; } /* The thread lock is released as ts lock above. */ } } /* * Adjust the thread's position on a turnstile after its priority has been * changed. */ static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { struct thread *td1, *td2; int queue; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or * turnstile_broadcast(). In this case, treat the thread as * if it was already running. */ if (td->td_turnstile != NULL) return (0); /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || (td2 != NULL && td->td_priority > td2->td_priority)) { /* * Remove thread from blocked chain and determine where * it should be moved to. 
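 * The ts_blocked lists are kept sorted from highest to lowest
 * priority, so the loop below simply searches for the thread's new
 * slot by priority.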
*/ queue = td->td_tsqueue; MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) { MPASS(td1->td_proc->p_magic == P_MAGIC); if (td1->td_priority > td->td_priority) break; } if (td1 == NULL) TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); else TAILQ_INSERT_BEFORE(td1, td, td_lockq); mtx_unlock_spin(&td_contested_lock); if (td1 == NULL) CTR3(KTR_LOCK, "turnstile_adjust_thread: td %d put at tail on [%p] %s", td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); else CTR4(KTR_LOCK, "turnstile_adjust_thread: td %d moved before %d on [%p] %s", td->td_tid, td1->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); } return (1); } /* * Early initialization of turnstiles. This is not done via a SYSINIT() * since this needs to be initialized very early when mutexes are first * initialized. */ void init_turnstiles(void) { int i; for (i = 0; i < TC_TABLESIZE; i++) { LIST_INIT(&turnstile_chains[i].tc_turnstiles); mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain", NULL, MTX_SPIN); } mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN); LIST_INIT(&thread0.td_contested); thread0.td_turnstile = NULL; } #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < TC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "turnstile chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth, 0, NULL); } } SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile_profiling, NULL); #endif static void init_turnstile0(void *dummy) { turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), NULL, #ifdef INVARIANTS turnstile_dtor, #else NULL, #endif turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); thread0.td_turnstile = turnstile_alloc(); } SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL); /* * Update a thread on the turnstile list after it's priority has been changed. * The old priority is passed in as an argument. */ void turnstile_adjust(struct thread *td, u_char oldpri) { struct turnstile *ts; MPASS(TD_ON_LOCK(td)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ if (!turnstile_adjust_thread(ts, td)) return; /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. * Note that we currently don't try to revoke lent priorities * when our priority goes up. */ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE || td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { propagate_priority(td); } } /* * Set the owner of the lock this turnstile is attached to. */ static void turnstile_setowner(struct turnstile *ts, struct thread *owner) { mtx_assert(&td_contested_lock, MA_OWNED); MPASS(ts->ts_owner == NULL); /* A shared lock might not have an owner. 
*/ if (owner == NULL) return; MPASS(owner->td_proc->p_magic == P_MAGIC); ts->ts_owner = owner; LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void turnstile_dtor(void *mem, int size, void *arg) { struct turnstile *ts; ts = mem; MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_pending)); } #endif /* * UMA zone item initializer. */ static int turnstile_init(void *mem, int size, int flags) { struct turnstile *ts; bzero(mem, size); ts = mem; TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN); return (0); } static void turnstile_fini(void *mem, int size) { struct turnstile *ts; ts = mem; mtx_destroy(&ts->ts_lock); } /* * Get a turnstile for a new thread. */ struct turnstile * turnstile_alloc(void) { return (uma_zalloc(turnstile_zone, M_WAITOK)); } /* * Free a turnstile when a thread is destroyed. */ void turnstile_free(struct turnstile *ts) { uma_zfree(turnstile_zone, ts); } /* * Lock the turnstile chain associated with the specified lock. */ void turnstile_chain_lock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); } struct turnstile * turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } ts = curthread->td_turnstile; MPASS(ts != NULL); mtx_lock_spin(&ts->ts_lock); KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); ts->ts_lockobj = lock; return (ts); } bool turnstile_lock(struct turnstile *ts, struct lock_object **lockp, struct thread **tdp) { struct turnstile_chain *tc; struct lock_object *lock; if ((lock = ts->ts_lockobj) == NULL) return (false); tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); mtx_lock_spin(&ts->ts_lock); if (__predict_false(lock != ts->ts_lockobj)) { mtx_unlock_spin(&tc->tc_lock); mtx_unlock_spin(&ts->ts_lock); return (false); } *lockp = lock; *tdp = ts->ts_owner; return (true); } void turnstile_unlock(struct turnstile *ts, struct lock_object *lock) { struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } void turnstile_assert(struct turnstile *ts) { MPASS(ts->ts_lockobj == NULL); } void turnstile_cancel(struct turnstile *ts) { struct turnstile_chain *tc; struct lock_object *lock; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); lock = ts->ts_lockobj; if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Look up the turnstile for a lock in the hash table locking the associated * turnstile chain along the way. If no turnstile is found in the hash * table, NULL is returned. */ struct turnstile * turnstile_lookup(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } return (NULL); } /* * Unlock the turnstile chain associated with a given lock. 
*/ void turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Return a pointer to the thread waiting on this turnstile with the * most important priority or NULL if the turnstile has no waiters. */ static struct thread * turnstile_first_waiter(struct turnstile *ts) { struct thread *std, *xtd; std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]); xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority)) return (std); return (xtd); } /* * Take ownership of a turnstile and adjust the priority of the new * owner appropriately. */ void turnstile_claim(struct turnstile *ts) { struct thread *td, *owner; struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); /* * Update the priority of the new owner if needed. */ thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); thread_unlock(owner); tc = TC_LOOKUP(ts->ts_lockobj); mtx_unlock_spin(&ts->ts_lock); mtx_unlock_spin(&tc->tc_lock); } /* * Block the current thread on the turnstile assicated with 'lock'. This * function will context switch and not return until this thread has been * woken back up. This function must be called with the appropriate * turnstile chain locked and will return with it unlocked. */ void turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; struct thread *td, *td1; struct lock_object *lock; td = curthread; mtx_assert(&ts->ts_lock, MA_OWNED); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. 
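 *
 * The caller normally obtained 'ts' from turnstile_trywait(), which
 * returns either the turnstile already attached to the lock or, if
 * none exists yet, curthread's own spare turnstile; the comparison
 * against td->td_turnstile below distinguishes the two cases.  A
 * lock implementation typically drives this interface roughly as
 * follows (an illustrative sketch only; 'lk', try_lock() and
 * lock_owner() stand in for the lock's own data and primitives):
 *
 *	ts = turnstile_trywait(&lk->lock_object);
 *	if (try_lock(lk)) {
 *		turnstile_cancel(ts);
 *		return;
 *	}
 *	turnstile_wait(ts, lock_owner(lk), TS_EXCLUSIVE_QUEUE);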
*/ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); if (ts == td->td_turnstile) { #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { tc->tc_max_depth = tc->tc_depth; if (tc->tc_max_depth > turnstile_max_depth) turnstile_max_depth = tc->tc_max_depth; } #endif LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]), ("thread's turnstile has exclusive waiters")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]), ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); } else { TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) if (td1->td_priority > td->td_priority) break; mtx_lock_spin(&td_contested_lock); if (td1 != NULL) TAILQ_INSERT_BEFORE(td1, td, td_lockq); else TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); MPASS(owner == ts->ts_owner); mtx_unlock_spin(&td_contested_lock); MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } thread_lock(td); thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; /* Save who we are blocked on and switch. */ lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; td->td_blktick = ticks; TD_SET_LOCK(td); mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); SDT_PROBE0(sched, , , sleep); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mi_switch(SW_VOL | SWT_TURNSTILE); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); } /* * Pick the highest priority thread on this turnstile and put it on the * pending list. This must be called with the turnstile chain locked. */ int turnstile_signal(struct turnstile *ts, int queue) { struct turnstile_chain *tc __unused; struct thread *td; int empty; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Pick the highest priority thread blocked on this lock and * move it to the pending list. */ td = TAILQ_FIRST(&ts->ts_blocked[queue]); MPASS(td->td_proc->p_magic == P_MAGIC); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); mtx_unlock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq); /* * If the turnstile is now empty, remove it from its chain and * give it to the about-to-be-woken thread. Otherwise take a * turnstile from the free list and give it to the thread. */ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts = LIST_FIRST(&ts->ts_free); MPASS(ts != NULL); LIST_REMOVE(ts, ts_hash); td->td_turnstile = ts; return (empty); } /* * Put all blocked threads on the pending list. This must be called with * the turnstile chain locked. 
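 * Note that this only moves the threads to the pending list; the
 * caller must still call turnstile_unpend() to make them runnable.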
*/ void turnstile_broadcast(struct turnstile *ts, int queue) { struct turnstile_chain *tc __unused; struct turnstile *ts1; struct thread *td; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); /* * We must have the chain locked so that we can remove the empty * turnstile from the hash queue. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Transfer the blocked list to the pending list. */ mtx_lock_spin(&td_contested_lock); TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq); mtx_unlock_spin(&td_contested_lock); /* * Give a turnstile to each thread. The last thread gets * this turnstile if the turnstile is empty. */ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) { if (LIST_EMPTY(&ts->ts_free)) { MPASS(TAILQ_NEXT(td, td_lockq) == NULL); ts1 = ts; #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts1 = LIST_FIRST(&ts->ts_free); MPASS(ts1 != NULL); LIST_REMOVE(ts1, ts_hash); td->td_turnstile = ts1; } } static u_char turnstile_calc_unlend_prio_locked(struct thread *td) { struct turnstile *nts; u_char cp, pri; THREAD_LOCK_ASSERT(td, MA_OWNED); mtx_assert(&td_contested_lock, MA_OWNED); pri = PRI_MAX; LIST_FOREACH(nts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } return (pri); } /* * Wakeup all threads on the pending list and adjust the priority of the * current thread appropriately. This must be called with the turnstile * chain locked. */ void turnstile_unpend(struct turnstile *ts) { TAILQ_HEAD( ,thread) pending_threads; struct thread *td; u_char pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* * Move the list of pending threads out of the turnstile and * into a local variable. */ TAILQ_INIT(&pending_threads); TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq); #ifdef INVARIANTS if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; thread_lock(td); mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. There might not be a current owner if this is a shared * lock. */ if (ts->ts_owner != NULL) { ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); } pri = turnstile_calc_unlend_prio_locked(td); mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in * turnstile_wait() or sitting on a run queue waiting to resume * in turnstile_wait(). Set a flag to force it to try to acquire * the lock again instead of blocking. 
*/ while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); thread_lock_block_wait(td); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); MPASS(TD_ON_LOCK(td)); TD_CLR_LOCK(td); MPASS(TD_CAN_RUN(td)); td->td_blocked = NULL; td->td_lockname = NULL; td->td_blktick = 0; #ifdef INVARIANTS td->td_tsqueue = 0xff; #endif sched_add(td, SRQ_HOLD | SRQ_BORING); } mtx_unlock_spin(&ts->ts_lock); } /* * Give up ownership of a turnstile. This must be called with the * turnstile chain locked. */ void turnstile_disown(struct turnstile *ts) { struct thread *td; u_char pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. */ mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; thread_lock(td); mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); pri = turnstile_calc_unlend_prio_locked(td); mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); } /* * Return the first thread in a turnstile. */ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } /* * Returns true if a sub-queue of a turnstile is empty. */ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } #ifdef DDB static void print_thread(struct thread *td, const char *prefix) { db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid, td->td_proc->p_pid, td->td_name); } static void print_queue(struct threadqueue *queue, const char *header, const char *prefix) { struct thread *td; db_printf("%s:\n", header); if (TAILQ_EMPTY(queue)) { db_printf("%sempty\n", prefix); return; } TAILQ_FOREACH(td, queue, td_lockq) { print_thread(td, prefix); } } DB_SHOW_COMMAND(turnstile, db_show_turnstile) { struct turnstile_chain *tc; struct turnstile *ts; struct lock_object *lock; int i; if (!have_addr) return; /* * First, see if there is an active turnstile for the lock indicated * by the address. */ lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) goto found; /* * Second, see if there is an active turnstile at the address * indicated. 
*/ for (i = 0; i < TC_TABLESIZE; i++) LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) { if (ts == (struct turnstile *)addr) goto found; } db_printf("Unable to locate a turnstile via %p\n", (void *)addr); return; found: lock = ts->ts_lockobj; db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); if (ts->ts_owner) print_thread(ts->ts_owner, "Lock Owner: "); else db_printf("Lock Owner: none\n"); print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t"); print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters", "\t"); print_queue(&ts->ts_pending, "Pending Threads", "\t"); } /* * Show all the threads a particular thread is waiting on based on * non-spin locks. */ static void print_lockchain(struct thread *td, const char *prefix) { struct lock_object *lock; struct lock_class *class; struct turnstile *ts; struct thread *owner; /* * Follow the chain. We keep walking as long as the thread is * blocked on a lock that has an owner. */ while (!db_pager_quit) { if (td == (void *)LK_KERNPROC) { db_printf("%sdisowned (LK_KERNPROC)\n", prefix); return; } db_printf("%sthread %d (pid %d, %s) is ", prefix, td->td_tid, td->td_proc->p_pid, td->td_name); switch (TD_GET_STATE(td)) { case TDS_INACTIVE: db_printf("inactive\n"); return; case TDS_CAN_RUN: db_printf("runnable\n"); return; case TDS_RUNQ: db_printf("on a run queue\n"); return; case TDS_RUNNING: db_printf("running on CPU %d\n", td->td_oncpu); return; case TDS_INHIBITED: if (TD_ON_LOCK(td)) { ts = td->td_blocked; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); db_printf("blocked on lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); if (ts->ts_owner == NULL) return; td = ts->ts_owner; break; } else if (TD_ON_SLEEPQ(td)) { if (!lockmgr_chain(td, &owner) && !sx_chain(td, &owner)) { db_printf("sleeping on %p \"%s\"\n", td->td_wchan, td->td_wmesg); return; } if (owner == NULL) return; td = owner; break; } db_printf("inhibited: %s\n", KTDSTATE(td)); return; default: db_printf("??? (%#x)\n", TD_GET_STATE(td)); return; } } } DB_SHOW_COMMAND(lockchain, db_show_lockchain) { struct thread *td; /* Figure out which thread to start with. 
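 * With no address argument we fall back to kdb_thread, i.e. the
 * thread the debugger is currently focused on.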
*/ if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; print_lockchain(td, ""); } DB_SHOW_ALIAS(sleepchain, db_show_lockchain); DB_SHOW_ALL_COMMAND(chains, db_show_allchains) { struct thread *td; struct proc *p; int i; i = 1; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) { db_printf("chain %d:\n", i++); print_lockchain(td, " "); } if (db_pager_quit) return; } } } -DB_SHOW_ALIAS(allchains, db_show_allchains) +DB_SHOW_ALIAS_FLAGS(allchains, db_show_allchains, DB_CMD_MEMSAFE) static void print_waiters(struct turnstile *ts, int indent); static void print_waiter(struct thread *td, int indent) { struct turnstile *ts; int i; if (db_pager_quit) return; for (i = 0; i < indent; i++) db_printf(" "); print_thread(td, "thread "); LIST_FOREACH(ts, &td->td_contested, ts_link) print_waiters(ts, indent + 1); } static void print_waiters(struct turnstile *ts, int indent) { struct lock_object *lock; struct lock_class *class; struct thread *td; int i; if (db_pager_quit) return; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); for (i = 0; i < indent; i++) db_printf(" "); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) print_waiter(td, indent + 1); } DB_SHOW_COMMAND(locktree, db_show_locktree) { struct lock_object *lock; struct lock_class *class; struct turnstile_chain *tc; struct turnstile *ts; if (!have_addr) return; lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) break; if (ts == NULL) { class = LOCK_CLASS(lock); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); } else print_waiters(ts, 0); } #endif diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 57f63d362a4f..33b1b506f85f 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -1,3137 +1,3137 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ #define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. 
*/ #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. 
*/ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; struct witness_blessed { const char *b_lock1; const char *b_lock2; }; struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); static int blessed(struct witness *, struct witness *); static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_enter_debugger(const char *msg); static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static 
uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_output_drain(void *arg __unused, const char *data, int len); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); FEATURE(witness, "kernel has witness(9) support"); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. 
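 * It is exported below as the debug.witness.badstacks sysctl.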
*/ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; #ifdef __i386__ static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; #endif static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rm }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "in_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in_multi_list_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. 
*/ { "in6_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in6_multi_list_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udpinp", &lock_class_rw }, { "udp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcpinp", &lock_class_rw }, { "tcp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { "devthrd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif { NULL, NULL }, { "sched lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { NULL, NULL }, /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #ifdef __i386__ { 
"allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; /* * Pairs of locks which have been blessed. Witness does not complain about * order problems with blessed lock pairs. Please do not add an entry to the * table without an explanatory comment. */ static struct witness_blessed blessed_list[] = { /* * See the comment in ufs_dirhash.c. Basically, a vnode lock serializes * both lock orders, so a deadlock cannot happen as a result of this * LOR. */ { "dirhash", "bufwait" }, /* * A UFS vnode may be locked in vget() while a buffer belonging to the * parent directory vnode is locked. */ { "ufs", "bufwait" }, }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. 
*/ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || KERNEL_PANICKED() || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. 
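/*
 * Illustrative sketch, not part of this file: how the NULL-terminated groups
 * in order_lists above turn into a chain of parent -> child edges in the
 * enroll()/itismychild() loop just above.  The two demo groups are copied
 * from the table; the struct, the printing and main() are simplified
 * stand-ins for struct witness_order_list_entry and itismychild().
 */
#include <stdio.h>

struct demo_entry {
	const char	*name;
};

static struct demo_entry demo_order[] = {
	{ "tcpinp" }, { "tcp" }, { "so_snd" }, { NULL },
	{ "kqueue" }, { "struct mount mtx" }, { "vnode interlock" }, { NULL },
	{ NULL },
};

int
main(void)
{
	struct demo_entry *order, *parent;

	/* Outer loop: first member of each group; inner loop: chain the rest. */
	for (order = demo_order; order->name != NULL; order++) {
		parent = order;
		for (order++; order->name != NULL; order++) {
			printf("%s -> %s\n", parent->name, order->name);
			parent = order;
		}
	}
	return (0);
}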
*/ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || KERNEL_PANICKED()) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. 
*/ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. 
If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { struct stack pstack; bool pstackv, trace; MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_flags & LI_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); /* * If the lock order is blessed, bail before logging * anything. We don't look for other lock order * violations though, which may be a bug. 
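/*
 * Illustrative sketch, not part of this file: the symmetric lookup that
 * blessed() below performs against blessed_list.  The two entries are the
 * ones from blessed_list above; the struct layout and main() are simplified
 * stand-ins for struct witness_blessed and the witness objects.
 */
#include <stdio.h>
#include <string.h>

struct demo_blessed {
	const char	*b_lock1;
	const char	*b_lock2;
};

static const struct demo_blessed demo_list[] = {
	{ "dirhash", "bufwait" },
	{ "ufs", "bufwait" },
};

/* Return 1 if the pair is blessed in either order, mirroring blessed(). */
static int
demo_blessed_pair(const char *n1, const char *n2)
{
	unsigned int i;

	for (i = 0; i < sizeof(demo_list) / sizeof(demo_list[0]); i++) {
		if (strcmp(n1, demo_list[i].b_lock1) == 0 &&
		    strcmp(n2, demo_list[i].b_lock2) == 0)
			return (1);
		if (strcmp(n1, demo_list[i].b_lock2) == 0 &&
		    strcmp(n2, demo_list[i].b_lock1) == 0)
			return (1);
	}
	return (0);
}

int
main(void)
{
	printf("%d\n", demo_blessed_pair("bufwait", "dirhash"));	/* 1 */
	printf("%d\n", demo_blessed_pair("ufs", "dirhash"));		/* 0 */
	return (0);
}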
*/ if (blessed(w, w1)) goto out; trace = atomic_load_int(&witness_trace); if (trace) { struct witness_lock_order_data *data; pstackv = false; data = witness_lock_order_get(w, w1); if (data != NULL) { stack_copy(&data->wlod_stack, &pstack); pstackv = true; } } mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } else { struct witness *w2 = lock2->li_lock->lo_witness; witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, w2->w_name, w2->w_class->lc_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } if (trace) { char buf[64]; struct sbuf sb; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); if (pstackv) { sbuf_printf(&sb, "lock order %s -> %s established at:\n", w->w_name, w1->w_name); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); } sbuf_printf(&sb, "lock order %s -> %s attempted at:\n", w1->w_name, w->w_name); stack_save(&pstack); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); sbuf_finish(&sb); sbuf_delete(&sb); } witness_enter_debugger(__func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. 
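/*
 * Illustrative sketch, not part of this file: the core of the reversal walk
 * in witness_checkorder() above.  For every lock already held, the lock
 * being acquired is checked against the established order; if the new lock
 * is supposed to come first, a reversal is reported.  The lock numbers and
 * the small matrix are made up for the example; the kernel consults
 * w_rmatrix through isitmydescendant() instead.
 */
#include <stdio.h>

#define DEMO_NW	4

/* demo_before[a][b] != 0 means "a is expected to be acquired before b". */
static const int demo_before[DEMO_NW][DEMO_NW] = {
	[0] = { 0, 1, 1, 0 },		/* 0 before 1 and 2 */
	[1] = { 0, 0, 1, 0 },		/* 1 before 2 */
};

static int
demo_checkorder(const int *held, int nheld, int acquiring)
{
	int i;

	for (i = 0; i < nheld; i++) {
		if (demo_before[acquiring][held[i]]) {
			printf("lock order reversal: %d after %d\n",
			    acquiring, held[i]);
			return (1);
		}
	}
	return (0);
}

int
main(void)
{
	int held[] = { 2 };			/* lock 2 is already held */

	(void)demo_checkorder(held, 1, 0);	/* acquiring 0: reversal */
	(void)demo_checkorder(held, 1, 3);	/* acquiring 3: fine */
	return (0);
}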
*/ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; instance->li_flags = 0; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags |= LI_EXCLUSIVE; if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0) instance->li_flags |= LI_SLEEPABLE; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if 
(lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || KERNEL_PANICKED()) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. 
*/ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || KERNEL_PANICKED()) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || KERNEL_PANICKED()) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_flags & LI_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. 
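/*
 * Illustrative sketch, not part of this file: the chunked per-thread lock
 * list that witness_lock() pushes onto and witness_unlock() removes from
 * above.  The chunk size, types and main() are simplified stand-ins for
 * struct lock_list_entry, LOCK_NCHILDREN and struct lock_instance; the
 * kernel additionally keeps a head entry around to reduce w_mtx traffic,
 * which this sketch only approximates.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_NCHILDREN	3

struct demo_lle {
	struct demo_lle	*next;
	int		 count;
	int		 children[DEMO_NCHILDREN];
};

/* Append to the head chunk, allocating a fresh head when it is full. */
static void
demo_push(struct demo_lle **head, int id)
{
	struct demo_lle *lle = *head;

	if (lle == NULL || lle->count == DEMO_NCHILDREN) {
		if ((lle = calloc(1, sizeof(*lle))) == NULL)
			abort();
		lle->next = *head;
		*head = lle;
	}
	lle->children[lle->count++] = id;
}

/* Remove one id, shifting later slots down, freeing an emptied chunk. */
static void
demo_remove(struct demo_lle **head, int id)
{
	struct demo_lle **llep, *lle;
	int i, j;

	for (llep = head; (lle = *llep) != NULL; llep = &lle->next)
		for (i = 0; i < lle->count; i++) {
			if (lle->children[i] != id)
				continue;
			for (j = i; j < lle->count - 1; j++)
				lle->children[j] = lle->children[j + 1];
			if (--lle->count == 0 && lle->next != NULL) {
				*llep = lle->next;
				free(lle);
			}
			return;
		}
}

int
main(void)
{
	struct demo_lle *locks = NULL, *lle;
	int i;

	for (i = 1; i <= 5; i++)
		demo_push(&locks, i);
	demo_remove(&locks, 3);
	/* Most recently acquired first, as witness_list_locks() iterates. */
	for (lle = locks; lle != NULL; lle = lle->next)
		for (i = lle->count - 1; i >= 0; i--)
			printf("held: %d\n", lle->children[i]);
	return (0);
}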
*/ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || KERNEL_PANICKED()) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. 
*/ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or inderect descendant of @ancestor. 
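/*
 * Illustrative sketch, not part of this file: the relationship matrix that
 * adopt() above maintains.  A direct edge sets PARENT/CHILD bits and the
 * transitive closure is then filled in with ANCESTOR/DESCENDANT bits.  The
 * bit values, array size and main() are made up for the example; the
 * kernel's WITNESS_* flags and w_rmatrix sizing differ.
 */
#include <stdio.h>

#define DEMO_NW		5
#define DEMO_PARENT	0x1
#define DEMO_CHILD	0x2
#define DEMO_ANCESTOR	0x4
#define DEMO_DESCENDANT	0x8
#define DEMO_ANC_MASK	(DEMO_PARENT | DEMO_ANCESTOR)

static unsigned char demo_rmatrix[DEMO_NW][DEMO_NW];

/* Record "p is a direct parent of c" and propagate the closure. */
static void
demo_adopt(int p, int c)
{
	int i, j;

	demo_rmatrix[p][c] |= DEMO_PARENT;
	demo_rmatrix[c][p] |= DEMO_CHILD;

	/*
	 * Every ancestor of p (p included) gains every descendant of c
	 * (c included) as a new descendant.
	 */
	for (i = 0; i < DEMO_NW; i++) {
		if (i != p && (demo_rmatrix[i][p] & DEMO_ANC_MASK) == 0)
			continue;
		for (j = 0; j < DEMO_NW; j++) {
			if (demo_rmatrix[i][j] & DEMO_ANC_MASK)
				continue;
			if (j != c &&
			    (demo_rmatrix[c][j] & DEMO_ANC_MASK) == 0)
				continue;
			demo_rmatrix[i][j] |= DEMO_ANCESTOR;
			demo_rmatrix[j][i] |= DEMO_DESCENDANT;
		}
	}
}

int
main(void)
{
	demo_adopt(0, 1);	/* 0 -> 1 */
	demo_adopt(1, 2);	/* 1 -> 2, making 0 an ancestor of 2 */
	printf("demo_rmatrix[0][2] = %#x\n", demo_rmatrix[0][2]);
	return (0);
}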
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. 
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB 
static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } -DB_SHOW_ALIAS(alllocks, db_witness_list_all) +DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE) -DB_SHOW_COMMAND(witness, db_witness_display) +DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. 
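/*
 * Illustrative sketch, not part of this file: the snapshot-and-retry pattern
 * used by sbuf_print_witness_badstacks() above, where w_generation (bumped
 * by witness_increment_graph_generation()) invalidates a report that raced
 * with a graph change.  The race is simulated within a single thread here;
 * the kernel takes w_mtx around the snapshot and the re-check.
 */
#include <stdio.h>

static int demo_generation;	/* bumped on every "graph" change */
static int demo_graph;

static void
demo_change(int v)
{
	demo_graph = v;
	demo_generation++;
}

/* Copy the data, then re-check the generation and restart if it moved. */
static int
demo_snapshot(int simulate_race)
{
	int gen, copy;

	for (;;) {
		gen = demo_generation;
		copy = demo_graph;
		if (simulate_race) {
			demo_change(demo_graph + 1);
			simulate_race = 0;
		}
		if (gen == demo_generation)
			return (copy);
		printf("generation changed, restarting\n");
	}
}

int
main(void)
{
	demo_change(41);
	printf("clean read: %d\n", demo_snapshot(0));
	printf("raced read: %d\n", demo_snapshot(1));
	return (0);
}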
*/ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); if (blessed(tmp_w1, tmp_w2)) continue; sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } -DB_SHOW_COMMAND(badstacks, db_witness_badstacks) +DB_SHOW_COMMAND_FLAGS(badstacks, db_witness_badstacks, DB_CMD_MEMSAFE) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; #ifdef __i386__ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); return (error); #endif if (witness_watch < 1) { error = 
SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. 
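/*
 * Illustrative sketch, not part of this file: the djb2 hash computed by
 * witness_hash_djb2() above ("hash = hash * 33 + key[i]", seeded with 5381)
 * and the bucket selection done by witness_hash_get()/witness_hash_put().
 * The 256-bucket modulus here is an arbitrary choice for the example; the
 * kernel uses WITNESS_HASH_SIZE.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
demo_djb2(const uint8_t *key, uint32_t size)
{
	uint32_t hash = 5381;
	uint32_t i;

	if (size == 0)
		size = (uint32_t)strlen((const char *)key);
	for (i = 0; i < size; i++)
		hash = ((hash << 5) + hash) + key[i];	/* hash * 33 + key[i] */
	return (hash);
}

int
main(void)
{
	const char *name = "vnode interlock";

	printf("%s -> bucket %u\n", name,
	    demo_djb2((const uint8_t *)name, 0) % 256);
	return (0);
}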
*/ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. 
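/*
 * Illustrative sketch, not part of this file: the unlocked fast path that
 * witness_checkorder() takes through witness_lock_order_check() above,
 * falling back to a re-check under the lock and to witness_lock_order_add()
 * only when the pair has not been seen yet.  The mutex, table and main()
 * are userspace stand-ins; the kernel uses w_mtx, the
 * WITNESS_LOCK_ORDER_KNOWN bit in w_rmatrix and a hash of saved stacks.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NW	8

static bool demo_known[DEMO_NW][DEMO_NW];
static pthread_mutex_t demo_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
demo_record_order(int parent, int child)
{
	/* Unlocked check first: already-known pairs are the common case. */
	if (demo_known[parent][child])
		return;
	pthread_mutex_lock(&demo_mtx);
	if (!demo_known[parent][child]) {	/* re-check under the lock */
		demo_known[parent][child] = true;
		printf("new order %d -> %d recorded\n", parent, child);
		/* this is the point where the kernel also saves a stack */
	}
	pthread_mutex_unlock(&demo_mtx);
}

int
main(void)
{
	demo_record_order(1, 2);
	demo_record_order(1, 2);	/* hits the unlocked fast path */
	return (0);
}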
*/ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } witness_enter_debugger(msg); } static void witness_enter_debugger(const char *msg) { #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 7724e66f7bad..8da40824e217 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,5613 +1,5613 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; bool bd_shutdown; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static char *buf; /* buffer header pool */ static struct buf * nbufp(unsigned i) { return ((struct buf *)(buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * i)); } caddr_t __read_mostly unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_range(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void 
bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of 
bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (have unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped
i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * bufdaemon shutdown request and sleep channel. */ static bool bd_shutdown; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. 
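 *
 * (Editorial note, not part of the original file: the bdlodirty and
 * bdhidirty bitsets declared above carry one bit per buffer domain, set
 * while that domain is over the corresponding dirty-buffer threshold.
 * The compiled-out helper below is a hedged sketch of how such a bitset
 * can be consulted; the helper name is hypothetical and it leans on the
 * file's own buf_domains and BIT_ISSET() declarations.)
 */
#if 0
static bool
sketch_any_domain_severely_dirty(void)
{
	int i;

	for (i = 0; i < buf_domains; i++)
		if (BIT_ISSET(BUF_DOMAINS, i, &bdhidirty))
			return (true);
	return (false);
}
#endif
/*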
*/ uma_zone_t buf_zone; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). 
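 *
 * (Editorial note, not part of the original file: bdirtysub() below and
 * bdirtyadd() after it rely on atomic_fetchadd_int() returning the value
 * the counter held *before* the update, so exactly one caller observes a
 * given threshold crossing and pays for the wakeup.  The compiled-out
 * block is a hedged sketch of that idiom; sketch_notify() is a
 * hypothetical stand-in for bd_wakeup()/bdirtywakeup().)
 */
#if 0
static void
sketch_count_up_and_notify(volatile u_int *counter, u_int threshold)
{
	u_int old;

	old = atomic_fetchadd_int(counter, 1);
	/*
	 * Only the increment that moves the counter from "threshold" to
	 * "threshold + 1" sees old == threshold, so the notification fires
	 * once per crossing instead of on every increment.
	 */
	if (old == threshold)
		sketch_notify();
}
#endif
/*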
*/ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. 
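 *
 * (Editorial note: the norunbuf mask computed below is applied with
 * "td->td_pflags &= norunbuf" once the flush is done.  It always clears
 * TDP_BUFNEED, and it preserves TDP_NORUNNINGBUF only if that flag was
 * already set when this thread entered here, restoring both flags to
 * their state on entry.)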
*/ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } static void bufspace_daemon_shutdown(void *arg, int howto __unused) { struct bufdomain *bd = arg; int error; BD_RUN_LOCK(bd); bd->bd_shutdown = true; wakeup(&bd->bd_running); error = msleep(&bd->bd_shutdown, BD_RUN_LOCKPTR(bd), 0, "bufspace_shutdown", 60 * hz); BD_RUN_UNLOCK(bd); if (error != 0) printf("bufspacedaemon wait error: %d\n", error); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd = arg; EVENTHANDLER_REGISTER(shutdown_pre_sync, bufspace_daemon_shutdown, bd, SHUTDOWN_PRI_LAST + 100); BD_RUN_LOCK(bd); while (!bd->bd_shutdown) { BD_RUN_UNLOCK(bd); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speedup dirty if we've run out of clean * buffers. This is possible in particular * because softdep may held many bufs locked * pending writes to other bufs which are * marked for delayed write, exhausting * clean space until they are written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); if (bd->bd_shutdown) break; atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO, "-", hz); } else { /* Avoid spurious wakeups while running. 
*/ atomic_store_int(&bd->bd_running, 1); } } wakeup(&bd->bd_shutdown); BD_RUN_UNLOCK(bd); kthread_exit(); } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { /* * This function and its results are protected by higher level * synchronization requiring vnode and buf locks to page in and * validate pages. */ if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. 
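 *
 * (Editorial note: the loop below rounds the tunable down to a power of
 * two and the subsequent tests clamp the result to [MAXBSIZE, maxphys].
 * For example, a loader-set value of 96 KiB is rounded down to 64 KiB,
 * while anything below MAXBSIZE is raised back up to MAXBSIZE.)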
*/ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > maxphys) maxbcachebuf = maxphys; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * With KASAN or KMSAN enabled, the kernel map is shadowed. Account for * this when sizing maps based on the amount of physical memory * available. */ #if defined(KASAN) physmem_est = (physmem_est * KASAN_SHADOW_SCALE) / (KASAN_SHADOW_SCALE + 1); #elif defined(KMSAN) physmem_est /= 3; /* * KMSAN cannot reliably determine whether buffer data is initialized * unless it is updated through a KVA mapping. */ unmapped_buf_allowed = 0; #endif /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. 
Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > maxphys) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / maxphys; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } if (nswbuf == 0) { nswbuf = min(nbuf / 4, 256); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; } /* * Reserve space for the buffer cache buffers */ buf = (char *)v; v = (caddr_t)buf + (sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf)) * nbuf; return (v); } /* * Single global constant for BUF_WMESG, to avoid getting multiple * references. */ static const char buf_wmesg[] = "bufwait"; /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(maxphys); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = nbufp(i); bzero(bp, sizeof(*bp) + sizeof(vm_page_t) * atop(maxbcachebuf)); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp, buf_wmesg); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. 
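 *
 * (Editorial note, worked example: with nbuf = 20000 the assignments below
 * give hidirtybuffers = 20000 / 4 + 20 = 5020 and dirtybufthresh =
 * 5020 * 9 / 10 = 4518; the low-memory loop that follows then halves
 * hidirtybuffers as many times as needed to fit the available bufspace.)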
*/ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf) + sizeof(vm_page_t) * atop(maxbcachebuf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + maxphys, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. 
*/ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int i, iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) nbusy++; } if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. */ for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (i = nbuf - 1; i >= 0; i--) { bp = nbufp(i); if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ BOOTTRACE("shutdown failed to sync buffers"); printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ swapoff_all(); } else { BOOTTRACE("shutdown sync complete"); if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems and perform swapoff, to quiesce * the system as much as possible. In particular, no * I/O should be initiated from top levels since it * might be abruptly terminated by reset, or otherwise * erronously handled because other parts of the * system are disabled. * * Swapoff before unmount, because file-backed swap is * non-operational after unmount of the underlying * filesystem. */ if (!KERNEL_PANICKED()) { swapoff_all(); vfs_unmountall(); } BOOTTRACE("shutdown unmounted all filesystems"); } DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. 
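 *
 * (Editorial note: the code below therefore maps the pages at a
 * page-aligned address first, then ORs the sub-page part of b_offset back
 * in.  For example, with 4 KiB pages and a b_offset whose low bits are
 * 0xa00, b_data ends up pointing 0xa00 bytes into the first mapped page.)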
*/ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); MPASS((bp->b_flags & B_MAXPHYS) == 0); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. 
*/ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs, error; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_reycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition below threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWITNESS, NULL); KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp, error)); (void)error; KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); MPASS((bp->b_flags & B_MAXPHYS) == 0); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. 
*/ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. 
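 *
 * (Editorial note, not part of the original file: bq_remove() below and
 * bq_insert() further down keep three things consistent under the queue
 * lock: the TAILQ linkage, the bq_len count, and the buffer's b_qindex
 * back pointer.  The compiled-out block is a hedged, standalone sketch of
 * that bookkeeping pattern with hypothetical types.)
 */
#if 0
#include <sys/queue.h>

struct sketch_item {
	TAILQ_ENTRY(sketch_item) link;
	int onqueue;			/* which queue we are on, or -1 */
};

struct sketch_queue {
	TAILQ_HEAD(, sketch_item) head;
	int len;
	int index;
};

static void
sketch_q_insert(struct sketch_queue *q, struct sketch_item *it)
{
	/* Caller holds the queue lock. */
	TAILQ_INSERT_TAIL(&q->head, it, link);
	q->len++;
	it->onqueue = q->index;
}

static void
sketch_q_remove(struct sketch_queue *q, struct sketch_item *it)
{
	/* Caller holds the queue lock and owns "it". */
	TAILQ_REMOVE(&q->head, it, link);
	q->len--;
	it->onqueue = -1;
}
#endif
/*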
*/ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(maxsize <= maxbcachebuf, ("bufkva_alloc kva too large %d %u", maxsize, maxbcachebuf)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. 
*/ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags * are specified. 
*/ error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } KASSERT(blkno == bp->b_lblkno, ("getblkx returned buffer for blkno %jd instead of blkno %jd", (intmax_t)bp->b_lblkno, (intmax_t)blkno)); flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. 
*/ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; struct bufdomain *bd; bd = &bdomain[bo->bo_domain]; if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > bd->bd_dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. 
We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. 
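 *
 * Illustrative caller pattern (sketch only): a write path throttles
 * before it takes any vnode locks, e.g.
 *
 *	bwillwrite();
 *	error = vn_lock(vp, LK_EXCLUSIVE);
 *	... modify and dirty buffers ...
 *	VOP_UNLOCK(vp);
 *
 * so that a throttled thread never sleeps while holding a vnode lock
 * that the flushing daemons may need in order to make progress.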
*/ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("brelse: SU io not finished bp %p", bp)); } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_flags & B_INVALONERR)) { /* * Forced invalidation of dirty buffer contents, to be used * after a failed write in the rare case that the loss of the * contents is acceptable. The buffer is invalidated and * freed. */ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. 
*/ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); bp->b_xflags &= ~(BX_CVTENXIO); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. 
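 *
 * Illustrative choice between the two release paths (sketch only):
 *
 *	if (error != 0 || (bp->b_flags & B_INVAL) != 0)
 *		brelse(bp);	discard or recycle the buffer
 *	else
 *		bqrelse(bp);	keep it cached; we expect to need it soon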
* * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_xflags &= ~(BX_CVTENXIO); if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags &= ~B_IOSTARTED; } else { KASSERT((bp->b_flags & B_IOSTARTED) == 0, ("bqrelse: SU io not finished bp %p", bp)); } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", blockcount_read(&obj->paging_in_progress), bp->b_npages)); vp = bp->b_vp; VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_object != NULL, vp); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_relookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). 
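 *
 * Called from brelse() when a VMIO buffer is dropped with B_NOCACHE set
 * or after a failed read, so that stale data is not left behind in the
 * backing VM object.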
*/ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_busy_acquire(m, VM_ALLOC_SBUSY); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_sunbusy(m); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; if (bp->b_npages < desiredpages) { KASSERT(desiredpages <= atop(maxbcachebuf), ("vfs_vmio_extend past maxbcachebuf %p %d %u", bp, desiredpages, maxbcachebuf)); /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. 
We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = maxphys / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? 
*/ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon_shutdown(void *arg __unused, int howto __unused) { int error; mtx_lock(&bdlock); bd_shutdown = true; wakeup(&bd_request); error = msleep(&bd_shutdown, &bdlock, 0, "buf_daemon_shutdown", 60 * hz); mtx_unlock(&bdlock); if (error != 0) printf("bufdaemon wait error: %d\n", error); } static void buf_daemon() { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, buf_daemon_shutdown, NULL, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. 
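 * One bufspace daemon is created per buffer domain; those threads
 * manage clean buffer space and KVA, while this thread flushes dirty
 * buffers.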
*/ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); while (!bd_shutdown) { bd_request = 0; mtx_unlock(&bdlock); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. */ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (bd_shutdown) break; if (BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } wakeup(&bd_shutdown); mtx_unlock(&bdlock); kthread_exit(); } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS, &flushwithdeps, 0, "Number of buffers flushed with dependencies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. 
*/ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { return (gbincore_unlocked(bo, blkno)); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ bool inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m, n; vm_ooffset_t off; int valid; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return (true); if (vp->v_mount == NULL) return (false); obj = vp->v_object; if (obj == NULL) return (false); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); recheck: if (m == NULL) return (false); tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); /* * Consider page validity only if page mapping didn't change * during the check. */ valid = vm_page_is_valid(m, (vm_offset_t)((toff + off) & PAGE_MASK), tinc); n = vm_page_lookup_unlocked(obj, OFF_TO_IDX(off + toff)); if (m != n) { m = n; goto recheck; } if (!valid) return (false); } return (true); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. 
This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); vfs_busy_pages_acquire(bp); vfs_setdirty_range(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vfs_busy_pages_release(bp); } static void vfs_setdirty_range(struct buf *bp) { vm_offset_t boffset; vm_offset_t eoffset; int i; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. 
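 * (bpmap_qenter() enters b_pages into the buffer KVA and folds the
 * page offset of b_offset back into b_data.)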
*/ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (vp->v_type != VCHR) ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = dblkno; /* Attempt lockless lookup first. */ bp = gbincore_unlocked(bo, blkno); if (bp == NULL) { /* * With GB_NOCREAT we must be sure about not finding the buffer * as it may have been reassigned during unlocked lookup. */ if ((flags & GB_NOCREAT) != 0) goto loop; goto newbuf_unlocked; } error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0, 0); if (error != 0) goto loop; /* Verify buf identify has not changed since lookup. 
*/ if (bp->b_bufobj == bo && bp->b_lblkno == blkno) goto foundbuf_fastpath; /* It changed, fallback to locked lookup. */ BUF_UNLOCK_RAW(bp); loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_INTERLOCK | ((flags & GB_LOCK_NOWAIT) != 0 ? LK_NOWAIT : LK_SLEEPFAIL); #ifdef WITNESS lockflags |= (flags & GB_NOWITNESS) != 0 ? LK_NOWITNESS : 0; #endif error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); foundbuf_fastpath: /* If recursed, assume caller knows the rules. */ if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). 
*/ BO_RUNLOCK(bo); newbuf_unlocked: /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); bsize = vn_isdisk(vp) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && !vn_isdisk(vp)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. 
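 *
 * Illustrative use (sketch only; "len" is hypothetical): a consumer
 * needing temporary buffer space rather than cached file data can do
 *
 *	bp = geteblk(len, 0);
 *	... fill or consume bp->b_data ...
 *	brelse(bp);
 *
 * Since the buffer is B_INVAL and has no vnode, brelse() recycles it
 * instead of caching its contents.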
*/ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. 
*/ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; /* * The check for done == biodone is to allow biodone to be * used as a bio_done routine. */ if (done == NULL || done == biodone) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wmesg) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wmesg, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. 
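 *
 * Illustrative completion sketch (driver-side details hypothetical): a
 * strategy routine finishing a struct buf directly typically does
 *
 *	bp->b_resid = ...;
 *	if (error != 0) {
 *		bp->b_error = error;
 *		bp->b_ioflags |= BIO_ERROR;
 *	}
 *	bufdone(bp);
 *
 * after which a synchronous caller wakes up in bufwait() and an
 * asynchronous buffer is released by bufdone() itself via brelse() or
 * bqrelse().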
*/ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. 
The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Acquire a shared busy on all pages in the buf. */ void vfs_busy_pages_acquire(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY); } void vfs_busy_pages_release(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, bp->b_npages); vfs_busy_pages_acquire(bp); } if (bp->b_bufsize != 0) vfs_setdirty_range(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_assert_sbusied(m); /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (vm_page_all_valid(m) && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); /* * Busy may not be strictly necessary here because the pages are * unlikely to be fully valid and the vnode lock will synchronize * their access via getpages. It is grabbed for consistency with * other page validation. 
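/*
 * Worked example (assuming PAGE_SIZE == 4096): with b_offset & PAGE_MASK
 * == 0x200, base == 0x100 and size == 0x2000, the loop below validates
 * 0xd00 bytes starting at offset 0x300 of page 0, all 0x1000 bytes of
 * page 1, and the first 0x300 bytes of page 2.
 */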
*/ vfs_busy_pages_acquire(bp); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vfs_busy_pages_release(bp); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, sa, ea, slide, zbits; vm_page_bits_t mask; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages_acquire(bp); sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; zbits = (sizeof(vm_page_bits_t) * NBBY) - (ea - sa) / DEV_BSIZE; mask = (VM_PAGE_BITS_ALL >> zbits) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE, roundup2(ea - sa, DEV_BSIZE)); } vfs_busy_pages_release(bp); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. 
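/*
 * Worked example for the block-level valid mask in vfs_bio_clrbuf()
 * above (assuming PAGE_SIZE == 4096 and DEV_BSIZE == 512, i.e. 8 valid
 * bits per page): with sa == 1024 and ea == 3072, j == 2 and the span is
 * (ea - sa) / DEV_BSIZE == 4 blocks, so
 *	mask = (VM_PAGE_BITS_ALL >> zbits) << j = 0x3c,
 * i.e. bits 2..5, the four 512-byte blocks of [sa, ea) within the page.
 */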
*/ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; MPASS((bp->b_flags & B_MAXPHYS) == 0); KASSERT(to - from <= maxbcachebuf, ("vm_hold_load_pages too large %p %#jx %#jx %u", bp, (uintmax_t)from, (uintmax_t)to, maxbcachebuf)); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc_noobj(VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; vm_page_unwire_noq(p); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, void *uaddr, size_t len, int mapbuf) { vm_prot_t prot; int pidx; MPASS((bp->b_flags & B_MAXPHYS) != 0); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uaddr, len, prot, bp->b_pages, PBUF_PAGES); if (pidx < 0) return (-1); bp->b_bufsize = len; bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)uaddr) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return (0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. 
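/*
 * Illustrative sketch (an assumption, not part of this change): a
 * physio(9)-style caller of vmapbuf()/vunmapbuf().  The pager buffer bp
 * (a pbuf with B_MAXPHYS set) and the dispatch routine are caller-supplied
 * placeholders; the point is the required bracket: check vmapbuf()'s
 * return value, issue and wait for the I/O, then undo the mapping with
 * vunmapbuf().  A non-zero mapbuf argument requests a mapped (KVA) transfer.
 */
static int
example_user_io(struct buf *bp, void *uaddr, size_t len,
    void (*dispatch)(struct buf *))
{
	int error;

	if (vmapbuf(bp, uaddr, len, 1) < 0)
		return (EFAULT);	/* could not wire the user pages */
	dispatch(bp);			/* e.g. the device strategy routine */
	error = bufwait(bp);
	vunmapbuf(bp);
	return (error);
}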
*/ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. */ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. 
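/*
 * Illustrative sketch (an assumption, not part of this change): draining
 * in-flight writes with the bufobj output counter above.  Each write in
 * flight holds a count taken via bufobj_wref() that its completion path
 * drops with bufobj_wdrop(); a flusher can wait for the count to reach
 * zero with bufobj_wwait() while holding the bufobj lock.
 */
static int
example_drain_writes(struct bufobj *bo)
{
	int error;

	BO_LOCK(bo);
	error = bufobj_wwait(bo, 0, 0);	/* sleeps until bo_numoutput == 0 */
	BO_UNLOCK(bo);
	return (error);
}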
*/ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bo_bs, bsize; int br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; error = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)), &bo_bs); if (error != 0) return (VM_PAGER_ERROR); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; again: for (i = 0; i < count; i++) { if (ma[i] != bogus_page) vm_page_busy_downgrade(ma[i]); } lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; if (m == bogus_page) continue; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). 
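/*
 * Worked example for the read-behind/read-ahead sizing above (assuming
 * PAGE_SIZE == 4096, a block size bo_bs == 16384, and a file large enough
 * that the EOF clamps do not apply): for a single requested page at file
 * offset 0x5000, lb == 0x5000 gives pgsin_b == 1; la becomes 0x6000 and
 * roundup2(la, bo_bs) == 0x8000 gives pgsin_a == 2, so the pager reads
 * the whole 16KB block as 1 + 1 + 2 pages.
 */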
*/ if (vm_page_all_valid(m)) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; error = get_blksize(vp, lbn, &bsize); if (error == 0) error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (bp->b_rcred == curthread->td_ucred) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || !vm_page_all_valid(m)) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || vm_page_all_valid(m) || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { if (!vm_page_none_valid(m) && !vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); } next_page:; } end_pages: redo = false; for (i = 0; i < count; i++) { if (ma[i] == bogus_page) continue; if (vm_page_busy_tryupgrade(ma[i]) == 0) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex, VM_ALLOC_NORMAL); } /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (!vm_page_all_valid(ma[i])) redo = true; } if (redo && error == 0) goto again; return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS); db_printf("b_vflags=0x%b b_ioflags0x%b\n", (u_int)bp->b_vflags, PRINT_BUF_VFLAGS, (u_int)bp->b_ioflags, PRINT_BIO_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, " "b_vp = %p, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? 
)"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } -DB_SHOW_COMMAND(bufqueues, bufqueues) +DB_SHOW_COMMAND_FLAGS(bufqueues, bufqueues, DB_CMD_MEMSAFE) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i && BUF_ISLOCKED(bp)) { cnt++; total += bp->b_bufsize; } } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) { bp = nbufp(j); if (bp->b_domain == i) { cnt++; total += bp->b_bufsize; } } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } -DB_SHOW_COMMAND(lockedbufs, lockedbufs) +DB_SHOW_COMMAND_FLAGS(lockedbufs, lockedbufs, DB_CMD_MEMSAFE) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } -DB_COMMAND(countfreebufs, db_coundfreebufs) +DB_COMMAND_FLAGS(countfreebufs, db_coundfreebufs, DB_CMD_MEMSAFE) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = nbufp(i); if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } 
db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a6344e139502..739d22089f97 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,7068 +1,7068 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vdropl_recycle(struct vnode *vp); static void vdrop_recycle(struct vnode *vp); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. 
* Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. 
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ #define vnsz2log 8 #ifndef DEBUG_LOCKS _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && sizeof(struct vnode) < 1UL << (vnsz2log + 1), "vnsz2log needs to be updated"); #endif /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. 
* * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } #ifdef KASAN static int vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) { kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); return (0); } static void vnode_dtor(void *mem, int size, void *arg __unused) { size_t end1, end2, off1, off2; _Static_assert(offsetof(struct vnode, v_vnodelist) < offsetof(struct vnode, v_dbatchcpu), "KASAN marks require updating"); off1 = offsetof(struct vnode, v_vnodelist); off2 = offsetof(struct vnode, v_dbatchcpu); end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); /* * Access to the v_vnodelist and v_dbatchcpu fields are permitted even * after the vnode has been freed. Try to get some KASAN coverage by * marking everything except those two fields as invalid. Because * KASAN's tracking is not byte-granular, any preceding fields sharing * the same 8-byte aligned word must also be marked valid. */ /* Handle the area from the start until v_vnodelist... */ off1 = rounddown2(off1, KASAN_SHADOW_SCALE); kasan_mark(mem, off1, off1, KASAN_UMA_FREED); /* ... then the area between v_vnodelist and v_dbatchcpu ... */ off1 = roundup2(end1, KASAN_SHADOW_SCALE); off2 = rounddown2(off2, KASAN_SHADOW_SCALE); if (off2 > off1) kasan_mark((void *)((char *)mem + off1), off2 - off1, off2 - off1, KASAN_UMA_FREED); /* ... and finally the area from v_dbatchcpu to the end. */ off2 = roundup2(end2, KASAN_SHADOW_SCALE); kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, KASAN_UMA_FREED); } #endif /* KASAN */ /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. 
*/ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); kasan_mark(mem, size, size, 0); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; uma_ctor ctor; uma_dtor dtor; int cpu, physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); #ifdef KASAN ctor = vnode_ctor; dtor = vnode_dtor; #else ctor = NULL; dtor = NULL; #endif vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. 
*/ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("%s: non-empty upper mount list with pending unmount", __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. 
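/*
 * Illustrative sketch (an assumption, not part of this change): the usual
 * reference/busy bracket around a mount point, built only from primitives
 * defined in this file (vfs_busy() above, vfs_unbusy(), vfs_getvfs() and
 * vfs_rel() below).  The op callback is a hypothetical placeholder for
 * whatever work needs the mount to stay mounted.
 */
static int
example_with_busy(fsid_t *fsid, int (*op)(struct mount *))
{
	struct mount *mp;
	int error;

	mp = vfs_getvfs(fsid);		/* returns a referenced mount or NULL */
	if (mp == NULL)
		return (ENOENT);
	error = vfs_busy(mp, 0);	/* may sleep across an unmount attempt */
	if (error == 0) {
		error = op(mp);		/* mp cannot be unmounted here */
		vfs_unbusy(mp);
	}
	vfs_rel(mp);
	return (error);
}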
*/ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. 
*/ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. 
It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. 
*/ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop_recycle(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->handle == vp && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) { struct vnode *vp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; vp = mvp; for (;;) { if (count == 0) { break; } vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); /* * FIXME: ignores the return value, meaning it may be nothing * got recycled but it claims otherwise to the caller. * * Originally the value started being ignored in 2005 with * 114a1006a8204aa156e1f9ad6476cdff89cada7f . * * Respecting the value can run into significant stalls if most * vnodes belong to one file system and it has writes * suspended. In presence of many threads and millions of * vnodes they keep contending on the vnode_list_mtx lock only * to find vnodes they can't recycle. * * The solution would be to pre-check if the vnode is likely to * be recycle-able, but it needs to happen with the * vnode_list_mtx lock held. 
This runs into a problem where * VOP_GETWRITEMOUNT (currently needed to find out about if * writes are frozen) can take locks which LOR against it. * * Check nullfs for one example (null_getwritemount). */ vtryrecycle(vp); count--; mtx_lock(&vnode_list_mtx); vp = mvp; } return (ocount - count); } static int vnlru_free_locked(int count) { mtx_assert(&vnode_list_mtx, MA_OWNED); return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp); mtx_unlock(&vnode_list_mtx); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static __inline void vfs_freevnodes_inc(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; critical_exit(); } static __inline void vfs_freevnodes_dec(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? 
freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. 
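 * As a rough illustration (numbers assumed, not measured): with about
 * 1M physical pages and 100k in-use vnodes, the computation below gives
 * a trigger of roughly 20 resident pages per vnode. The computed value
 * is only used on the more aggressive passes; milder passes override it
 * with vsmalltrigger.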
*/ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop_recycle(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop_recycle(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. 
Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. 
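 * The asserts below also verify that the vnode carries no stale state
 * (VM object, advisory locks, poll info) from a previous use.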
*/ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. 
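 * The vnode must not be on a mount's lazy list at this point (asserted
 * below). Both the mount interlock and the vnode interlock are taken so
 * that clearing v_mount and shrinking the per-mount vnode list stay
 * consistent.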
*/ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static int insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); } else { KASSERT(!dtr, ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", __func__)); } /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } /* * Insert into list of vnodes for the new mount point, if available. * insmntque() reclaims the vnode on insertion failure, insmntque1() * leaves handling of the vnode to the caller. */ int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, true)); } int insmntque1(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, false)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); return (EBUSY); } } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. 
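 * Hence the loop below drains bufobj write I/O and, unless V_VMIO was
 * requested, also waits out the VM object's paging-in-progress activity.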
*/ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? 
*/ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). 
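 * The block numbers are converted to byte offsets using the bufobj's
 * block size (asserted below), and the backing pages are removed with
 * vn_pages_remove() once the buffer invalidation loop has completed.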
*/ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. 
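 * Three cases are handled below: append at the tail when the list is
 * empty or the new logical block number is the largest, insert at the
 * head when the _LE lookup finds no smaller or equal key, otherwise
 * insert right after the in-order predecessor it returns.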
*/ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
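 * The bufobj is hashed into a pending bucket roughly 'delay' seconds in
 * the future (capped at syncer_maxdelay - 2); if it is already on the
 * worklist it is moved to the new slot rather than added twice.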
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. 
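 * (SYNCER_SHUTDOWN_SPEEDUP bounds how many of these final passes are
 * made before the syncer allows itself to be suspended, as checked at
 * the top of the loop.)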
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. 
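 * A newly dirtied buffer moves to the dirty list and puts the bufobj on
 * the syncer worklist with a delay that depends on the vnode type, while
 * a cleaned buffer moves back to the clean list and the bufobj leaves
 * the worklist once it has no dirty buffers left.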
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. 
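 * A rough usage sketch for the SMR path (the vfs_smr_enter()/vfs_smr_exit()
 * wrappers and the error handling are illustrative, not prescriptive):
 *
 *	vfs_smr_enter();
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		return (EAGAIN);
 *	error = vget_finish(vp, LK_SHARED, vs);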
*/ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. 
*/ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. */ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. 
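 * The non-zero error set below routes the vnode to vdefer_inactive()
 * further down instead of performing inactive processing now.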
*/ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. 
This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. 
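 * (The cmpset below only succeeds while the hold count is still 0; a
 * concurrent vhold_smr() that bumped the count first wins the race and
 * the vnode is left alone here.)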
*/ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } static void __always_inline vdropl_impl(struct vnode *vp, bool enqueue) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } if (!enqueue) { VI_UNLOCK(vp); return; } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } void vdropl(struct vnode *vp) { vdropl_impl(vp, true); } /* * vdrop a vnode when recycling * * This is a special case routine only to be used when recycling, differs from * regular vdrop by not requeieing the vnode on LRU. * * Consider a case where vtryrecycle continuously fails with all vnodes (due to * e.g., frozen writes on the filesystem), filling the batch and causing it to * be requeued. Then vnlru will end up revisiting the same vnodes. This is a * loop which can last for as long as writes are frozen. */ static void vdropl_recycle(struct vnode *vp) { vdropl_impl(vp, false); } static void vdrop_recycle(struct vnode *vp) { VI_LOCK(vp); vdropl_recycle(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { struct vm_object *obj; int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } error = VOP_INACTIVE(vp); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. 
* * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. 
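 * The `rootrefs' references are the ones accounted for by our caller, so when the root is the only busy vnode and its usecount equals `rootrefs', nobody else is using it.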
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) { struct mount *mp; struct mount_upper_node *ump; mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_notify)) return; MNT_ILOCK(mp); mp->mnt_upper_pending++; KASSERT(mp->mnt_upper_pending > 0, ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump->mp, vp); break; } MNT_ILOCK(mp); } mp->mnt_upper_pending--; if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && mp->mnt_upper_pending == 0) { mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. 
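 * The order below is: VOP_CLOSE() if the vnode was in use, then (unless inactive processing is already underway) vinactivef() repeated until VI_OWEINACT stays clear, and only then buffer/VM cleanup and VOP_RECLAIM().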
*/ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) 
{ va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); if (irflag & VIRF_TEXT_REF) strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if (vp->v_object != NULL) printf(" v_object %p ref %d 
pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ -DB_SHOW_COMMAND(lockedvnods, lockedvnodes) +DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOMSYNC); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_RECURSE); 
MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_NULL_NOCACHE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. 
*/ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. 
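 * The unlocked result is only a hint; vfs_periodic_msync_inactive() re-checks the object and VV_NOSYNC after vget() before calling vm_object_page_clean().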
*/ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; int lkflags, objflags; bool seen_defer; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); free(vi, M_VNODEPOLL); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. 
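 * Each mount gets at most one such vnode (mnt_syncer); it is placed on the syncer worklist so that sync_fsync() periodically runs vfs_periodic() and VFS_SYNC(mp, MNT_LAZY) for the filesystem.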
*/ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque1(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); VOP_UNLOCK(syncvp); save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. 
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. 
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (KERNEL_PANICKED() || vp == NULL) return; locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. 
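 * The vhold() calls below pin the vnodes involved (skipping duplicates) so they cannot be recycled across the VOP; vop_rename_post() drops the matching references.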
*/ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } static void vop_fsync_debugprepost(struct vnode *vp, const char *name) { if (vp->v_type == VCHR) ; else if (MNT_EXTENDED_SHARED(vp->v_mount)) ASSERT_VOP_LOCKED(vp, name); else ASSERT_VOP_ELOCKED(vp, name); } void vop_fsync_debugpre(void *a) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fsync_debugpost(void *a, int rc __unused) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpre(void *a) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpost(void *a, int rc __unused) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); 
vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void 
vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
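 * The value below packs the boot-relative seconds into the upper 32 bits and the top half of the bintime fraction into the lower 32 bits, so successive calls within one boot are non-decreasing; e.g. an uptime of 5.25 s yields (5 << 32) | 0x40000000.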
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && kn->kn_filter != EVFILT_WRITE), ("READ/WRITE filter on a FIFO leaked through")); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). 
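 * Only whiteouts, zero-length names and the "." and ".." entries are skipped by the scan below; anything else ends it with ENOTEMPTY.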
*/ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. 
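 * This is the slow path: vfs_cache_root() normally returns mnt_rootvnode under vfs_op_thread_enter() and falls back here when per-CPU operation is suspended (mnt_vfs_ops > 0), the cached vnode is missing or doomed, or locking it fails.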
*/ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. 
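 * (Clarifying note: at this point the iteration still holds
 * mp->mnt_listmtx, so taking the vnode interlock unconditionally would
 * invert the interlock-before-list-lock order established by
 * mnt_vnode_next_lazy_relock() above; hence the VI_TRYLOCK() with a
 * relock fallback below.)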
*/ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. 
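 *
 * Illustrative reader-side sketch (hypothetical caller; assumes the
 * vn_seqc_read_any()/vn_seqc_consistent() wrappers and vfs_smr_enter()
 * used elsewhere in the VFS code):
 *
 *	vfs_smr_enter();
 *	seq = vn_seqc_read_any(vp);
 *	if (seqc_in_modify(seq))
 *		goto fallback;		(vnode busy or already freed)
 *	... speculatively read vnode fields ...
 *	if (!vn_seqc_consistent(vp, seq))
 *		goto fallback;
 *	vfs_smr_exit();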
*/ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } diff --git a/sys/net/vnet.c b/sys/net/vnet.c index 2480fc8dd86c..4f242b07f169 100644 --- a/sys/net/vnet.c +++ b/sys/net/vnet.c @@ -1,810 +1,810 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include /*- * This file implements core functions for virtual network stacks: * * - Virtual network stack management functions. * * - Virtual network stack memory allocator, which virtualizes global * variables in the network stack * * - Virtualized SYSINIT's/SYSUNINIT's, which allow network stack subsystems * to register startup/shutdown events to be run for each virtual network * stack instance. */ FEATURE(vimage, "VIMAGE kernel virtualization"); static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* * The virtual network stack list has two read-write locks, one sleepable and * the other not, so that the list can be stabilized and walked in a variety * of network stack contexts. Both must be acquired exclusively to modify * the list, but a read lock of either lock is sufficient to walk the list. */ struct rwlock vnet_rwlock; struct sx vnet_sxlock; #define VNET_LIST_WLOCK() do { \ sx_xlock(&vnet_sxlock); \ rw_wlock(&vnet_rwlock); \ } while (0) #define VNET_LIST_WUNLOCK() do { \ rw_wunlock(&vnet_rwlock); \ sx_xunlock(&vnet_sxlock); \ } while (0) struct vnet_list_head vnet_head; struct vnet *vnet0; /* * The virtual network stack allocator provides storage for virtualized * global variables. These variables are defined/declared using the * VNET_DEFINE()/VNET_DECLARE() macros, which place them in the 'set_vnet' * linker set. The details of the implementation are somewhat subtle, but * allow the majority of network subsystems to remain * virtualization-agnostic. * * The virtual network stack allocator handles variables in the base kernel * vs. modules in similar but different ways. In both cases, virtualized * global variables are marked as such by being declared to be part of the * vnet linker set. These "master" copies of global variables serve two * functions: * * (1) They contain static initialization or "default" values for global * variables which will be propagated to each virtual network stack * instance when created. As with normal global variables, they default * to zero-filled. * * (2) They act as unique global names by which the variable can be referred * to, regardless of network stack instance. The single global symbol * will be used to calculate the location of a per-virtual instance * variable at run-time. * * Each virtual network stack instance has a complete copy of each * virtualized global variable, stored in a malloc'd block of memory * referred to by vnet->vnet_data_mem. Critical to the design is that each * per-instance memory block is laid out identically to the master block so * that the offset of each global variable is the same across all blocks. To * optimize run-time access, a precalculated 'base' address, * vnet->vnet_data_base, is stored in each vnet, and is the amount that can * be added to the address of a 'master' instance of a variable to get to the * per-vnet instance. * * Virtualized global variables are handled in a similar manner, but as each * module has its own 'set_vnet' linker set, and we want to keep all * virtualized globals together, we reserve space in the kernel's linker set * for potential module variables using a per-vnet character array, * 'modspace'.
The virtual network stack allocator maintains a free list to * track what space in the array is free (all, initially) and as modules are * linked, allocates portions of the space to specific globals. The kernel * module linker queries the virtual network stack allocator and will * bind references of the global to the location during linking. It also * calls into the virtual network stack allocator, once the memory is * initialized, in order to propagate the new static initializations to all * existing virtual network stack instances so that the soon-to-be executing * module will find every network stack instance with proper default values. */ /* * Number of bytes of data in the 'set_vnet' linker set, and hence the total * size of all kernel virtualized global variables, and the malloc(9) type * that will be used to allocate it. */ #define VNET_BYTES (VNET_STOP - VNET_START) static MALLOC_DEFINE(M_VNET_DATA, "vnet_data", "VNET data"); /* * VNET_MODMIN is the minimum number of bytes we will reserve for the sum of * global variables across all loaded modules. As this actually sizes an * array declared as a virtualized global variable in the kernel itself, and * we want the virtualized global variable space to be page-sized, we may * have more space than that in practice. */ #define VNET_MODMIN (8 * PAGE_SIZE) #define VNET_SIZE roundup2(VNET_BYTES, PAGE_SIZE) /* * Space to store virtualized global variables from loadable kernel modules, * and the free list to manage it. */ VNET_DEFINE_STATIC(char, modspace[VNET_MODMIN] __aligned(__alignof(void *))); /* * Global lists of subsystem constructor and destructors for vnets. They are * registered via VNET_SYSINIT() and VNET_SYSUNINIT(). Both lists are * protected by the vnet_sysinit_sxlock global lock. */ static TAILQ_HEAD(vnet_sysinit_head, vnet_sysinit) vnet_constructors = TAILQ_HEAD_INITIALIZER(vnet_constructors); static TAILQ_HEAD(vnet_sysuninit_head, vnet_sysinit) vnet_destructors = TAILQ_HEAD_INITIALIZER(vnet_destructors); struct sx vnet_sysinit_sxlock; #define VNET_SYSINIT_WLOCK() sx_xlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_WUNLOCK() sx_xunlock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RLOCK() sx_slock(&vnet_sysinit_sxlock); #define VNET_SYSINIT_RUNLOCK() sx_sunlock(&vnet_sysinit_sxlock); struct vnet_data_free { uintptr_t vnd_start; int vnd_len; TAILQ_ENTRY(vnet_data_free) vnd_link; }; static MALLOC_DEFINE(M_VNET_DATA_FREE, "vnet_data_free", "VNET resource accounting"); static TAILQ_HEAD(, vnet_data_free) vnet_data_free_head = TAILQ_HEAD_INITIALIZER(vnet_data_free_head); static struct sx vnet_data_free_lock; SDT_PROVIDER_DEFINE(vnet); SDT_PROBE_DEFINE1(vnet, functions, vnet_alloc, entry, "int"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, alloc, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_alloc, return, "int", "struct vnet *"); SDT_PROBE_DEFINE2(vnet, functions, vnet_destroy, entry, "int", "struct vnet *"); SDT_PROBE_DEFINE1(vnet, functions, vnet_destroy, return, "int"); #ifdef DDB static void db_show_vnet_print_vs(struct vnet_sysinit *, int); #endif /* * Allocate a virtual network stack. */ struct vnet * vnet_alloc(void) { struct vnet *vnet; SDT_PROBE1(vnet, functions, vnet_alloc, entry, __LINE__); vnet = malloc(sizeof(struct vnet), M_VNET, M_WAITOK | M_ZERO); vnet->vnet_magic_n = VNET_MAGIC_N; SDT_PROBE2(vnet, functions, vnet_alloc, alloc, __LINE__, vnet); /* * Allocate storage for virtualized global variables and copy in * initial values form our 'master' copy. 
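 *
 * (Illustrative: once vnet_data_base is computed below, a virtualized
 * variable V in this instance is reached roughly as
 *
 *	*(type *)(curvnet->vnet_data_base + (uintptr_t)&VNET_NAME(V))
 *
 * which is what the VNET() accessor macros expand to; the exact macro
 * forms live in sys/net/vnet.h and are paraphrased here, not quoted.)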
*/ vnet->vnet_data_mem = malloc(VNET_SIZE, M_VNET_DATA, M_WAITOK); memcpy(vnet->vnet_data_mem, (void *)VNET_START, VNET_BYTES); /* * All use of vnet-specific data will immediately subtract VNET_START * from the base memory pointer, so pre-calculate that now to avoid * it on each use. */ vnet->vnet_data_base = (uintptr_t)vnet->vnet_data_mem - VNET_START; /* Initialize / attach vnet module instances. */ CURVNET_SET_QUIET(vnet); vnet_sysinit(); CURVNET_RESTORE(); VNET_LIST_WLOCK(); LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); VNET_LIST_WUNLOCK(); SDT_PROBE2(vnet, functions, vnet_alloc, return, __LINE__, vnet); return (vnet); } /* * Destroy a virtual network stack. */ void vnet_destroy(struct vnet *vnet) { SDT_PROBE2(vnet, functions, vnet_destroy, entry, __LINE__, vnet); KASSERT(vnet->vnet_sockcnt == 0, ("%s: vnet still has sockets", __func__)); VNET_LIST_WLOCK(); LIST_REMOVE(vnet, vnet_le); VNET_LIST_WUNLOCK(); /* Signal that VNET is being shutdown. */ vnet->vnet_shutdown = true; CURVNET_SET_QUIET(vnet); sx_xlock(&ifnet_detach_sxlock); vnet_sysuninit(); sx_xunlock(&ifnet_detach_sxlock); CURVNET_RESTORE(); /* * Release storage for the virtual network stack instance. */ free(vnet->vnet_data_mem, M_VNET_DATA); vnet->vnet_data_mem = NULL; vnet->vnet_data_base = 0; vnet->vnet_magic_n = 0xdeadbeef; free(vnet, M_VNET); SDT_PROBE1(vnet, functions, vnet_destroy, return, __LINE__); } /* * Boot time initialization and allocation of virtual network stacks. */ static void vnet_init_prelink(void *arg __unused) { rw_init(&vnet_rwlock, "vnet_rwlock"); sx_init(&vnet_sxlock, "vnet_sxlock"); sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void vnet0_init(void *arg __unused) { if (bootverbose) printf("VIMAGE (virtualized network stack) enabled\n"); /* * We MUST clear curvnet in vi_init_done() before going SMP, * otherwise CURVNET_SET() macros would scream about unnecessary * curvnet recursions. */ curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); } SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); static void vnet_init_done(void *unused __unused) { curvnet = NULL; } SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_init_done, NULL); /* * Once on boot, initialize the modspace freelist to entirely cover modspace. */ static void vnet_data_startup(void *dummy __unused) { struct vnet_data_free *df; df = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); df->vnd_start = (uintptr_t)&VNET_NAME(modspace); df->vnd_len = VNET_MODMIN; TAILQ_INSERT_HEAD(&vnet_data_free_head, df, vnd_link); sx_init(&vnet_data_free_lock, "vnet_data alloc lock"); } SYSINIT(vnet_data, SI_SUB_KLD, SI_ORDER_FIRST, vnet_data_startup, NULL); /* Dummy VNET_SYSINIT to make sure we always reach the final end state. */ static void vnet_sysinit_done(void *unused __unused) { return; } VNET_SYSINIT(vnet_sysinit_done, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_sysinit_done, NULL); /* * When a module is loaded and requires storage for a virtualized global * variable, allocate space from the modspace free list. This interface * should be used only by the kernel linker. 
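 *
 * Expected linker-side sequence (a sketch; the call sites live in the
 * kernel linker, not in this file):
 *
 *	addr = vnet_data_alloc(size);		(carve space out of modspace)
 *	... bind the module's VNET_DEFINE()d symbols to addr ...
 *	vnet_data_copy(addr, size);		(propagate defaults to all vnets)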
*/ void * vnet_data_alloc(int size) { struct vnet_data_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_len < size) continue; if (df->vnd_len == size) { s = (void *)df->vnd_start; TAILQ_REMOVE(&vnet_data_free_head, df, vnd_link); free(df, M_VNET_DATA_FREE); break; } s = (void *)df->vnd_start; df->vnd_len -= size; df->vnd_start = df->vnd_start + size; break; } sx_xunlock(&vnet_data_free_lock); return (s); } /* * Free space for a virtualized global variable on module unload. */ void vnet_data_free(void *start_arg, int size) { struct vnet_data_free *df; struct vnet_data_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)start_arg; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&vnet_data_free_lock); TAILQ_FOREACH(df, &vnet_data_free_head, vnd_link) { if (df->vnd_start > end) break; /* * If we expand at the end of an entry we may have to merge * it with the one following it as well. */ if (df->vnd_start + df->vnd_len == start) { df->vnd_len += size; dn = TAILQ_NEXT(df, vnd_link); if (df->vnd_start + df->vnd_len == dn->vnd_start) { df->vnd_len += dn->vnd_len; TAILQ_REMOVE(&vnet_data_free_head, dn, vnd_link); free(dn, M_VNET_DATA_FREE); } sx_xunlock(&vnet_data_free_lock); return; } if (df->vnd_start == end) { df->vnd_start = start; df->vnd_len += size; sx_xunlock(&vnet_data_free_lock); return; } } dn = malloc(sizeof(*df), M_VNET_DATA_FREE, M_WAITOK | M_ZERO); dn->vnd_start = start; dn->vnd_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, vnd_link); else TAILQ_INSERT_TAIL(&vnet_data_free_head, dn, vnd_link); sx_xunlock(&vnet_data_free_lock); } /* * When a new virtualized global variable has been allocated, propagate its * initial value to each already-allocated virtual network stack instance. */ void vnet_data_copy(void *start, int size) { struct vnet *vnet; VNET_LIST_RLOCK(); LIST_FOREACH(vnet, &vnet_head, vnet_le) memcpy((void *)((uintptr_t)vnet->vnet_data_base + (uintptr_t)start), start, size); VNET_LIST_RUNLOCK(); } /* * Support for special SYSINIT handlers registered via VNET_SYSINIT() * and VNET_SYSUNINIT(). */ void vnet_register_sysinit(void *arg) { struct vnet_sysinit *vs, *vs2; struct vnet *vnet; vs = arg; KASSERT(vs->subsystem > SI_SUB_VNET, ("vnet sysinit too early")); /* Add the constructor to the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_constructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_constructors, vs, link); /* * Invoke the constructor on all the existing vnets when it is * registered. */ VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysinit(void *arg) { struct vnet_sysinit *vs; vs = arg; /* Remove the constructor from the global list of vnet constructors. */ VNET_SYSINIT_WLOCK(); TAILQ_REMOVE(&vnet_constructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_register_sysuninit(void *arg) { struct vnet_sysinit *vs, *vs2; vs = arg; /* Add the destructor to the global list of vnet destructors. 
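 * The list is kept sorted by (subsystem, order), mirroring
 * vnet_register_sysinit() above, so that vnet_sysuninit() can walk it in
 * reverse to tear a vnet down in the opposite order of construction.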
*/ VNET_SYSINIT_WLOCK(); TAILQ_FOREACH(vs2, &vnet_destructors, link) { if (vs2->subsystem > vs->subsystem) break; if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) break; } if (vs2 != NULL) TAILQ_INSERT_BEFORE(vs2, vs, link); else TAILQ_INSERT_TAIL(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } void vnet_deregister_sysuninit(void *arg) { struct vnet_sysinit *vs; struct vnet *vnet; vs = arg; /* * Invoke the destructor on all the existing vnets when it is * deregistered. */ VNET_SYSINIT_WLOCK(); VNET_FOREACH(vnet) { CURVNET_SET_QUIET(vnet); vs->func(vs->arg); CURVNET_RESTORE(); } /* Remove the destructor from the global list of vnet destructors. */ TAILQ_REMOVE(&vnet_destructors, vs, link); VNET_SYSINIT_WUNLOCK(); } /* * Invoke all registered vnet constructors on the current vnet. Used during * vnet construction. The caller is responsible for ensuring the new vnet is * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysinit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH(vs, &vnet_constructors, link) { curvnet->vnet_state = vs->subsystem; vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * Invoke all registered vnet destructors on the current vnet. Used during * vnet destruction. The caller is responsible for ensuring the dying vnet * the current vnet and that the vnet_sysinit_sxlock lock is locked. */ void vnet_sysuninit(void) { struct vnet_sysinit *vs; VNET_SYSINIT_RLOCK(); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { curvnet->vnet_state = vs->subsystem; vs->func(vs->arg); } VNET_SYSINIT_RUNLOCK(); } /* * EVENTHANDLER(9) extensions. */ /* * Invoke the eventhandler function originally registered with the possibly * registered argument for all virtual network stack instances. * * This iterator can only be used for eventhandlers that do not take any * additional arguments, as we do ignore the variadic arguments from the * EVENTHANDLER_INVOKE() call. */ void vnet_global_eventhandler_iterator_func(void *arg, ...) { VNET_ITERATOR_DECL(vnet_iter); struct eventhandler_entry_vimage *v_ee; /* * There is a bug here in that we should actually cast things to * (struct eventhandler_entry_ ## name *) but that's not easily * possible in here so just re-using the variadic version we * defined for the generic vimage case. */ v_ee = arg; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } #ifdef VNET_DEBUG struct vnet_recursion { SLIST_ENTRY(vnet_recursion) vnr_le; const char *prev_fn; const char *where_fn; int where_line; struct vnet *old_vnet; struct vnet *new_vnet; }; static SLIST_HEAD(, vnet_recursion) vnet_recursions = SLIST_HEAD_INITIALIZER(vnet_recursions); static void vnet_print_recursion(struct vnet_recursion *vnr, int brief) { if (!brief) printf("CURVNET_SET() recursion in "); printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, vnr->prev_fn); if (brief) printf(", "); else printf("\n "); printf("%p -> %p\n", vnr->old_vnet, vnr->new_vnet); } void vnet_log_recursion(struct vnet *old_vnet, const char *old_fn, int line) { struct vnet_recursion *vnr; /* Skip already logged recursion events. 
*/ SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) if (vnr->prev_fn == old_fn && vnr->where_fn == curthread->td_vnet_lpush && vnr->where_line == line && (vnr->old_vnet == vnr->new_vnet) == (curvnet == old_vnet)) return; vnr = malloc(sizeof(*vnr), M_VNET, M_NOWAIT | M_ZERO); if (vnr == NULL) panic("%s: malloc failed", __func__); vnr->prev_fn = old_fn; vnr->where_fn = curthread->td_vnet_lpush; vnr->where_line = line; vnr->old_vnet = old_vnet; vnr->new_vnet = curvnet; SLIST_INSERT_HEAD(&vnet_recursions, vnr, vnr_le); vnet_print_recursion(vnr, 0); #ifdef KDB kdb_backtrace(); #endif } #endif /* VNET_DEBUG */ /* * DDB(4). */ #ifdef DDB static void db_vnet_print(struct vnet *vnet) { db_printf("vnet = %p\n", vnet); db_printf(" vnet_magic_n = %#08x (%s, orig %#08x)\n", vnet->vnet_magic_n, (vnet->vnet_magic_n == VNET_MAGIC_N) ? "ok" : "mismatch", VNET_MAGIC_N); db_printf(" vnet_ifcnt = %u\n", vnet->vnet_ifcnt); db_printf(" vnet_sockcnt = %u\n", vnet->vnet_sockcnt); db_printf(" vnet_data_mem = %p\n", vnet->vnet_data_mem); db_printf(" vnet_data_base = %#jx\n", (uintmax_t)vnet->vnet_data_base); db_printf(" vnet_state = %#08x\n", vnet->vnet_state); db_printf(" vnet_shutdown = %#03x\n", vnet->vnet_shutdown); db_printf("\n"); } DB_SHOW_ALL_COMMAND(vnets, db_show_all_vnets) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { db_vnet_print(vnet_iter); if (db_pager_quit) break; } } DB_SHOW_COMMAND(vnet, db_show_vnet) { if (!have_addr) { db_printf("usage: show vnet \n"); return; } db_vnet_print((struct vnet *)addr); } static void db_show_vnet_print_vs(struct vnet_sysinit *vs, int ddb) { const char *vsname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (vs == NULL) { xprint("%s: no vnet_sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); db_symbol_values(sym, &vsname, NULL); sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); xprint(" %#08x %#08x\n", vs->subsystem, vs->order); xprint(" %p(%s)(%p)\n", vs->func, (funcname != NULL) ? funcname : "", vs->arg); #undef xprint } -DB_SHOW_COMMAND(vnet_sysinit, db_show_vnet_sysinit) +DB_SHOW_COMMAND_FLAGS(vnet_sysinit, db_show_vnet_sysinit, DB_CMD_MEMSAFE) { struct vnet_sysinit *vs; db_printf("VNET_SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH(vs, &vnet_constructors, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } -DB_SHOW_COMMAND(vnet_sysuninit, db_show_vnet_sysuninit) +DB_SHOW_COMMAND_FLAGS(vnet_sysuninit, db_show_vnet_sysuninit, DB_CMD_MEMSAFE) { struct vnet_sysinit *vs; db_printf("VNET_SYSUNINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); TAILQ_FOREACH_REVERSE(vs, &vnet_destructors, vnet_sysuninit_head, link) { db_show_vnet_print_vs(vs, 1); if (db_pager_quit) break; } } #ifdef VNET_DEBUG -DB_SHOW_COMMAND(vnetrcrs, db_show_vnetrcrs) +DB_SHOW_COMMAND_FLAGS(vnetrcrs, db_show_vnetrcrs, DB_CMD_MEMSAFE) { struct vnet_recursion *vnr; SLIST_FOREACH(vnr, &vnet_recursions, vnr_le) vnet_print_recursion(vnr, 1); } #endif #endif /* DDB */ diff --git a/sys/opencrypto/crypto.c b/sys/opencrypto/crypto.c index 7231b390302f..ec2a639e04ce 100644 --- a/sys/opencrypto/crypto.c +++ b/sys/opencrypto/crypto.c @@ -1,2023 +1,2023 @@ /*- * Copyright (c) 2002-2006 Sam Leffler. All rights reserved. 
* Copyright (c) 2021 The FreeBSD Foundation * * Portions of this software were developed by Ararat River * Consulting, LLC under sponsorship of the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Cryptographic Subsystem. * * This code is derived from the Openbsd Cryptographic Framework (OCF) * that has the copyright shown below. Very little of the original * code remains. */ /*- * The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu) * * This code was written by Angelos D. Keromytis in Athens, Greece, in * February 2000. Network Security Technologies Inc. (NSTI) kindly * supported the development of this code. * * Copyright (c) 2000, 2001 Angelos D. Keromytis * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all source code copies of any software which is or includes a copy or * modification of this software. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cryptodev_if.h" #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) #include #endif SDT_PROVIDER_DEFINE(opencrypto); /* * Crypto drivers register themselves by allocating a slot in the * crypto_drivers table with crypto_get_driverid(). */ static struct mtx crypto_drivers_mtx; /* lock on driver table */ #define CRYPTO_DRIVER_LOCK() mtx_lock(&crypto_drivers_mtx) #define CRYPTO_DRIVER_UNLOCK() mtx_unlock(&crypto_drivers_mtx) #define CRYPTO_DRIVER_ASSERT() mtx_assert(&crypto_drivers_mtx, MA_OWNED) /* * Crypto device/driver capabilities structure. * * Synchronization: * (d) - protected by CRYPTO_DRIVER_LOCK() * (q) - protected by CRYPTO_Q_LOCK() * Not tagged fields are read-only. 
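 *
 * For example, cc_sessions and cc_flags below carry the (d) tag and are
 * only touched with CRYPTO_DRIVER_LOCK() held, while cc_qblocked is (q)
 * and is manipulated under CRYPTO_Q_LOCK() by the dispatch path.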
*/ struct cryptocap { device_t cc_dev; uint32_t cc_hid; uint32_t cc_sessions; /* (d) # of sessions */ int cc_flags; /* (d) flags */ #define CRYPTOCAP_F_CLEANUP 0x80000000 /* needs resource cleanup */ int cc_qblocked; /* (q) symmetric q blocked */ size_t cc_session_size; volatile int cc_refs; }; static struct cryptocap **crypto_drivers = NULL; static int crypto_drivers_size = 0; struct crypto_session { struct cryptocap *cap; struct crypto_session_params csp; uint64_t id; /* Driver softc follows. */ }; static int crp_sleep = 0; static TAILQ_HEAD(cryptop_q ,cryptop) crp_q; /* request queues */ static struct mtx crypto_q_mtx; #define CRYPTO_Q_LOCK() mtx_lock(&crypto_q_mtx) #define CRYPTO_Q_UNLOCK() mtx_unlock(&crypto_q_mtx) SYSCTL_NODE(_kern, OID_AUTO, crypto, CTLFLAG_RW, 0, "In-kernel cryptography"); /* * Taskqueue used to dispatch the crypto requests * that have the CRYPTO_F_ASYNC flag */ static struct taskqueue *crypto_tq; /* * Crypto seq numbers are operated on with modular arithmetic */ #define CRYPTO_SEQ_GT(a,b) ((int)((a)-(b)) > 0) struct crypto_ret_worker { struct mtx crypto_ret_mtx; TAILQ_HEAD(,cryptop) crp_ordered_ret_q; /* ordered callback queue for symetric jobs */ TAILQ_HEAD(,cryptop) crp_ret_q; /* callback queue for symetric jobs */ uint32_t reorder_ops; /* total ordered sym jobs received */ uint32_t reorder_cur_seq; /* current sym job dispatched */ struct thread *td; }; static struct crypto_ret_worker *crypto_ret_workers = NULL; #define CRYPTO_RETW(i) (&crypto_ret_workers[i]) #define CRYPTO_RETW_ID(w) ((w) - crypto_ret_workers) #define FOREACH_CRYPTO_RETW(w) \ for (w = crypto_ret_workers; w < crypto_ret_workers + crypto_workers_num; ++w) #define CRYPTO_RETW_LOCK(w) mtx_lock(&w->crypto_ret_mtx) #define CRYPTO_RETW_UNLOCK(w) mtx_unlock(&w->crypto_ret_mtx) static int crypto_workers_num = 0; SYSCTL_INT(_kern_crypto, OID_AUTO, num_workers, CTLFLAG_RDTUN, &crypto_workers_num, 0, "Number of crypto workers used to dispatch crypto jobs"); #ifdef COMPAT_FREEBSD12 SYSCTL_INT(_kern, OID_AUTO, crypto_workers_num, CTLFLAG_RDTUN, &crypto_workers_num, 0, "Number of crypto workers used to dispatch crypto jobs"); #endif static uma_zone_t cryptop_zone; int crypto_devallowsoft = 0; SYSCTL_INT(_kern_crypto, OID_AUTO, allow_soft, CTLFLAG_RWTUN, &crypto_devallowsoft, 0, "Enable use of software crypto by /dev/crypto"); #ifdef COMPAT_FREEBSD12 SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RWTUN, &crypto_devallowsoft, 0, "Enable/disable use of software crypto by /dev/crypto"); #endif #ifdef DIAGNOSTIC bool crypto_destroyreq_check; SYSCTL_BOOL(_kern_crypto, OID_AUTO, destroyreq_check, CTLFLAG_RWTUN, &crypto_destroyreq_check, 0, "Enable checks when destroying a request"); #endif MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records"); static void crypto_dispatch_thread(void *arg); static struct thread *cryptotd; static void crypto_ret_thread(void *arg); static void crypto_destroy(void); static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint); static void crypto_task_invoke(void *ctx, int pending); static void crypto_batch_enqueue(struct cryptop *crp); static counter_u64_t cryptostats[sizeof(struct cryptostats) / sizeof(uint64_t)]; SYSCTL_COUNTER_U64_ARRAY(_kern_crypto, OID_AUTO, stats, CTLFLAG_RW, cryptostats, nitems(cryptostats), "Crypto system statistics"); #define CRYPTOSTAT_INC(stat) do { \ counter_u64_add( \ cryptostats[offsetof(struct cryptostats, stat) / sizeof(uint64_t)],\ 1); \ } while (0) static void cryptostats_init(void *arg __unused) { 
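	/* Allocate the per-CPU counter(9) array exported by the stats sysctl. */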
COUNTER_ARRAY_ALLOC(cryptostats, nitems(cryptostats), M_WAITOK); } SYSINIT(cryptostats_init, SI_SUB_COUNTER, SI_ORDER_ANY, cryptostats_init, NULL); static void cryptostats_fini(void *arg __unused) { COUNTER_ARRAY_FREE(cryptostats, nitems(cryptostats)); } SYSUNINIT(cryptostats_fini, SI_SUB_COUNTER, SI_ORDER_ANY, cryptostats_fini, NULL); /* Try to avoid directly exposing the key buffer as a symbol */ static struct keybuf *keybuf; static struct keybuf empty_keybuf = { .kb_nents = 0 }; /* Obtain the key buffer from boot metadata */ static void keybuf_init(void) { caddr_t kmdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); keybuf = (struct keybuf *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_KEYBUF); if (keybuf == NULL) keybuf = &empty_keybuf; } /* It'd be nice if we could store these in some kind of secure memory... */ struct keybuf * get_keybuf(void) { return (keybuf); } static struct cryptocap * cap_ref(struct cryptocap *cap) { refcount_acquire(&cap->cc_refs); return (cap); } static void cap_rele(struct cryptocap *cap) { if (refcount_release(&cap->cc_refs) == 0) return; KASSERT(cap->cc_sessions == 0, ("freeing crypto driver with active sessions")); free(cap, M_CRYPTO_DATA); } static int crypto_init(void) { struct crypto_ret_worker *ret_worker; struct proc *p; int error; mtx_init(&crypto_drivers_mtx, "crypto driver table", NULL, MTX_DEF); TAILQ_INIT(&crp_q); mtx_init(&crypto_q_mtx, "crypto op queues", NULL, MTX_DEF); cryptop_zone = uma_zcreate("cryptop", sizeof(struct cryptop), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); crypto_drivers_size = CRYPTO_DRIVERS_INITIAL; crypto_drivers = malloc(crypto_drivers_size * sizeof(struct cryptocap), M_CRYPTO_DATA, M_WAITOK | M_ZERO); if (crypto_workers_num < 1 || crypto_workers_num > mp_ncpus) crypto_workers_num = mp_ncpus; crypto_tq = taskqueue_create("crypto", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &crypto_tq); taskqueue_start_threads(&crypto_tq, crypto_workers_num, PRI_MIN_KERN, "crypto"); p = NULL; error = kproc_kthread_add(crypto_dispatch_thread, NULL, &p, &cryptotd, 0, 0, "crypto", "crypto"); if (error) { printf("crypto_init: cannot start crypto thread; error %d", error); goto bad; } crypto_ret_workers = mallocarray(crypto_workers_num, sizeof(struct crypto_ret_worker), M_CRYPTO_DATA, M_WAITOK | M_ZERO); FOREACH_CRYPTO_RETW(ret_worker) { TAILQ_INIT(&ret_worker->crp_ordered_ret_q); TAILQ_INIT(&ret_worker->crp_ret_q); ret_worker->reorder_ops = 0; ret_worker->reorder_cur_seq = 0; mtx_init(&ret_worker->crypto_ret_mtx, "crypto return queues", NULL, MTX_DEF); error = kthread_add(crypto_ret_thread, ret_worker, p, &ret_worker->td, 0, 0, "crypto returns %td", CRYPTO_RETW_ID(ret_worker)); if (error) { printf("crypto_init: cannot start cryptoret thread; error %d", error); goto bad; } } keybuf_init(); return 0; bad: crypto_destroy(); return error; } /* * Signal a crypto thread to terminate. We use the driver * table lock to synchronize the sleep/wakeups so that we * are sure the threads have terminated before we release * the data structures they use. See crypto_finis below * for the other half of this song-and-dance. 
*/ static void crypto_terminate(struct thread **tdp, void *q) { struct thread *td; mtx_assert(&crypto_drivers_mtx, MA_OWNED); td = *tdp; *tdp = NULL; if (td != NULL) { wakeup_one(q); mtx_sleep(td, &crypto_drivers_mtx, PWAIT, "crypto_destroy", 0); } } static void hmac_init_pad(const struct auth_hash *axf, const char *key, int klen, void *auth_ctx, uint8_t padval) { uint8_t hmac_key[HMAC_MAX_BLOCK_LEN]; u_int i; KASSERT(axf->blocksize <= sizeof(hmac_key), ("Invalid HMAC block size %d", axf->blocksize)); /* * If the key is larger than the block size, use the digest of * the key as the key instead. */ memset(hmac_key, 0, sizeof(hmac_key)); if (klen > axf->blocksize) { axf->Init(auth_ctx); axf->Update(auth_ctx, key, klen); axf->Final(hmac_key, auth_ctx); klen = axf->hashsize; } else memcpy(hmac_key, key, klen); for (i = 0; i < axf->blocksize; i++) hmac_key[i] ^= padval; axf->Init(auth_ctx); axf->Update(auth_ctx, hmac_key, axf->blocksize); explicit_bzero(hmac_key, sizeof(hmac_key)); } void hmac_init_ipad(const struct auth_hash *axf, const char *key, int klen, void *auth_ctx) { hmac_init_pad(axf, key, klen, auth_ctx, HMAC_IPAD_VAL); } void hmac_init_opad(const struct auth_hash *axf, const char *key, int klen, void *auth_ctx) { hmac_init_pad(axf, key, klen, auth_ctx, HMAC_OPAD_VAL); } static void crypto_destroy(void) { struct crypto_ret_worker *ret_worker; int i; /* * Terminate any crypto threads. */ if (crypto_tq != NULL) taskqueue_drain_all(crypto_tq); CRYPTO_DRIVER_LOCK(); crypto_terminate(&cryptotd, &crp_q); FOREACH_CRYPTO_RETW(ret_worker) crypto_terminate(&ret_worker->td, &ret_worker->crp_ret_q); CRYPTO_DRIVER_UNLOCK(); /* XXX flush queues??? */ /* * Reclaim dynamically allocated resources. */ for (i = 0; i < crypto_drivers_size; i++) { if (crypto_drivers[i] != NULL) cap_rele(crypto_drivers[i]); } free(crypto_drivers, M_CRYPTO_DATA); if (cryptop_zone != NULL) uma_zdestroy(cryptop_zone); mtx_destroy(&crypto_q_mtx); FOREACH_CRYPTO_RETW(ret_worker) mtx_destroy(&ret_worker->crypto_ret_mtx); free(crypto_ret_workers, M_CRYPTO_DATA); if (crypto_tq != NULL) taskqueue_free(crypto_tq); mtx_destroy(&crypto_drivers_mtx); } uint32_t crypto_ses2hid(crypto_session_t crypto_session) { return (crypto_session->cap->cc_hid); } uint32_t crypto_ses2caps(crypto_session_t crypto_session) { return (crypto_session->cap->cc_flags & 0xff000000); } void * crypto_get_driver_session(crypto_session_t crypto_session) { return (crypto_session + 1); } const struct crypto_session_params * crypto_get_params(crypto_session_t crypto_session) { return (&crypto_session->csp); } const struct auth_hash * crypto_auth_hash(const struct crypto_session_params *csp) { switch (csp->csp_auth_alg) { case CRYPTO_SHA1_HMAC: return (&auth_hash_hmac_sha1); case CRYPTO_SHA2_224_HMAC: return (&auth_hash_hmac_sha2_224); case CRYPTO_SHA2_256_HMAC: return (&auth_hash_hmac_sha2_256); case CRYPTO_SHA2_384_HMAC: return (&auth_hash_hmac_sha2_384); case CRYPTO_SHA2_512_HMAC: return (&auth_hash_hmac_sha2_512); case CRYPTO_NULL_HMAC: return (&auth_hash_null); case CRYPTO_RIPEMD160_HMAC: return (&auth_hash_hmac_ripemd_160); case CRYPTO_RIPEMD160: return (&auth_hash_ripemd_160); case CRYPTO_SHA1: return (&auth_hash_sha1); case CRYPTO_SHA2_224: return (&auth_hash_sha2_224); case CRYPTO_SHA2_256: return (&auth_hash_sha2_256); case CRYPTO_SHA2_384: return (&auth_hash_sha2_384); case CRYPTO_SHA2_512: return (&auth_hash_sha2_512); case CRYPTO_AES_NIST_GMAC: switch (csp->csp_auth_klen) { case 128 / 8: return (&auth_hash_nist_gmac_aes_128); case 192 / 8: return 
(&auth_hash_nist_gmac_aes_192); case 256 / 8: return (&auth_hash_nist_gmac_aes_256); default: return (NULL); } case CRYPTO_BLAKE2B: return (&auth_hash_blake2b); case CRYPTO_BLAKE2S: return (&auth_hash_blake2s); case CRYPTO_POLY1305: return (&auth_hash_poly1305); case CRYPTO_AES_CCM_CBC_MAC: switch (csp->csp_auth_klen) { case 128 / 8: return (&auth_hash_ccm_cbc_mac_128); case 192 / 8: return (&auth_hash_ccm_cbc_mac_192); case 256 / 8: return (&auth_hash_ccm_cbc_mac_256); default: return (NULL); } default: return (NULL); } } const struct enc_xform * crypto_cipher(const struct crypto_session_params *csp) { switch (csp->csp_cipher_alg) { case CRYPTO_AES_CBC: return (&enc_xform_aes_cbc); case CRYPTO_AES_XTS: return (&enc_xform_aes_xts); case CRYPTO_AES_ICM: return (&enc_xform_aes_icm); case CRYPTO_AES_NIST_GCM_16: return (&enc_xform_aes_nist_gcm); case CRYPTO_CAMELLIA_CBC: return (&enc_xform_camellia); case CRYPTO_NULL_CBC: return (&enc_xform_null); case CRYPTO_CHACHA20: return (&enc_xform_chacha20); case CRYPTO_AES_CCM_16: return (&enc_xform_ccm); case CRYPTO_CHACHA20_POLY1305: return (&enc_xform_chacha20_poly1305); case CRYPTO_XCHACHA20_POLY1305: return (&enc_xform_xchacha20_poly1305); default: return (NULL); } } static struct cryptocap * crypto_checkdriver(uint32_t hid) { return (hid >= crypto_drivers_size ? NULL : crypto_drivers[hid]); } /* * Select a driver for a new session that supports the specified * algorithms and, optionally, is constrained according to the flags. */ static struct cryptocap * crypto_select_driver(const struct crypto_session_params *csp, int flags) { struct cryptocap *cap, *best; int best_match, error, hid; CRYPTO_DRIVER_ASSERT(); best = NULL; for (hid = 0; hid < crypto_drivers_size; hid++) { /* * If there is no driver for this slot, or the driver * is not appropriate (hardware or software based on * match), then skip. */ cap = crypto_drivers[hid]; if (cap == NULL || (cap->cc_flags & flags) == 0) continue; error = CRYPTODEV_PROBESESSION(cap->cc_dev, csp); if (error >= 0) continue; /* * Use the driver with the highest probe value. * Hardware drivers use a higher probe value than * software. In case of a tie, prefer the driver with * the fewest active sessions. 
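 *
 * (Note: as the check above implies, CRYPTODEV_PROBESESSION() reports
 * success with a negative value, so "highest" means closest to zero;
 * hardware drivers are expected to return a less negative value than
 * software ones.)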
*/ if (best == NULL || error > best_match || (error == best_match && cap->cc_sessions < best->cc_sessions)) { best = cap; best_match = error; } } return best; } static enum alg_type { ALG_NONE = 0, ALG_CIPHER, ALG_DIGEST, ALG_KEYED_DIGEST, ALG_COMPRESSION, ALG_AEAD } alg_types[] = { [CRYPTO_SHA1_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_RIPEMD160_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_AES_CBC] = ALG_CIPHER, [CRYPTO_SHA1] = ALG_DIGEST, [CRYPTO_NULL_HMAC] = ALG_DIGEST, [CRYPTO_NULL_CBC] = ALG_CIPHER, [CRYPTO_DEFLATE_COMP] = ALG_COMPRESSION, [CRYPTO_SHA2_256_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_SHA2_384_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_SHA2_512_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_CAMELLIA_CBC] = ALG_CIPHER, [CRYPTO_AES_XTS] = ALG_CIPHER, [CRYPTO_AES_ICM] = ALG_CIPHER, [CRYPTO_AES_NIST_GMAC] = ALG_KEYED_DIGEST, [CRYPTO_AES_NIST_GCM_16] = ALG_AEAD, [CRYPTO_BLAKE2B] = ALG_KEYED_DIGEST, [CRYPTO_BLAKE2S] = ALG_KEYED_DIGEST, [CRYPTO_CHACHA20] = ALG_CIPHER, [CRYPTO_SHA2_224_HMAC] = ALG_KEYED_DIGEST, [CRYPTO_RIPEMD160] = ALG_DIGEST, [CRYPTO_SHA2_224] = ALG_DIGEST, [CRYPTO_SHA2_256] = ALG_DIGEST, [CRYPTO_SHA2_384] = ALG_DIGEST, [CRYPTO_SHA2_512] = ALG_DIGEST, [CRYPTO_POLY1305] = ALG_KEYED_DIGEST, [CRYPTO_AES_CCM_CBC_MAC] = ALG_KEYED_DIGEST, [CRYPTO_AES_CCM_16] = ALG_AEAD, [CRYPTO_CHACHA20_POLY1305] = ALG_AEAD, [CRYPTO_XCHACHA20_POLY1305] = ALG_AEAD, }; static enum alg_type alg_type(int alg) { if (alg < nitems(alg_types)) return (alg_types[alg]); return (ALG_NONE); } static bool alg_is_compression(int alg) { return (alg_type(alg) == ALG_COMPRESSION); } static bool alg_is_cipher(int alg) { return (alg_type(alg) == ALG_CIPHER); } static bool alg_is_digest(int alg) { return (alg_type(alg) == ALG_DIGEST || alg_type(alg) == ALG_KEYED_DIGEST); } static bool alg_is_keyed_digest(int alg) { return (alg_type(alg) == ALG_KEYED_DIGEST); } static bool alg_is_aead(int alg) { return (alg_type(alg) == ALG_AEAD); } static bool ccm_tag_length_valid(int len) { /* RFC 3610 */ switch (len) { case 4: case 6: case 8: case 10: case 12: case 14: case 16: return (true); default: return (false); } } #define SUPPORTED_SES (CSP_F_SEPARATE_OUTPUT | CSP_F_SEPARATE_AAD | CSP_F_ESN) /* Various sanity checks on crypto session parameters. */ static bool check_csp(const struct crypto_session_params *csp) { const struct auth_hash *axf; /* Mode-independent checks. 
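 *
 * Illustrative parameter block that is intended to pass these checks
 * (the concrete values are an example chosen here, not a template taken
 * from this file):
 *
 *	struct crypto_session_params csp = {
 *		.csp_mode = CSP_MODE_AEAD,
 *		.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16,
 *		.csp_cipher_klen = 256 / 8,
 *		.csp_ivlen = AES_GCM_IV_LEN,
 *	};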
*/ if ((csp->csp_flags & ~(SUPPORTED_SES)) != 0) return (false); if (csp->csp_ivlen < 0 || csp->csp_cipher_klen < 0 || csp->csp_auth_klen < 0 || csp->csp_auth_mlen < 0) return (false); if (csp->csp_auth_key != NULL && csp->csp_auth_klen == 0) return (false); if (csp->csp_cipher_key != NULL && csp->csp_cipher_klen == 0) return (false); switch (csp->csp_mode) { case CSP_MODE_COMPRESS: if (!alg_is_compression(csp->csp_cipher_alg)) return (false); if (csp->csp_flags & CSP_F_SEPARATE_OUTPUT) return (false); if (csp->csp_flags & CSP_F_SEPARATE_AAD) return (false); if (csp->csp_cipher_klen != 0 || csp->csp_ivlen != 0 || csp->csp_auth_alg != 0 || csp->csp_auth_klen != 0 || csp->csp_auth_mlen != 0) return (false); break; case CSP_MODE_CIPHER: if (!alg_is_cipher(csp->csp_cipher_alg)) return (false); if (csp->csp_flags & CSP_F_SEPARATE_AAD) return (false); if (csp->csp_cipher_alg != CRYPTO_NULL_CBC) { if (csp->csp_cipher_klen == 0) return (false); if (csp->csp_ivlen == 0) return (false); } if (csp->csp_ivlen >= EALG_MAX_BLOCK_LEN) return (false); if (csp->csp_auth_alg != 0 || csp->csp_auth_klen != 0 || csp->csp_auth_mlen != 0) return (false); break; case CSP_MODE_DIGEST: if (csp->csp_cipher_alg != 0 || csp->csp_cipher_klen != 0) return (false); if (csp->csp_flags & CSP_F_SEPARATE_AAD) return (false); /* IV is optional for digests (e.g. GMAC). */ switch (csp->csp_auth_alg) { case CRYPTO_AES_CCM_CBC_MAC: if (csp->csp_ivlen < 7 || csp->csp_ivlen > 13) return (false); break; case CRYPTO_AES_NIST_GMAC: if (csp->csp_ivlen != AES_GCM_IV_LEN) return (false); break; default: if (csp->csp_ivlen != 0) return (false); break; } if (!alg_is_digest(csp->csp_auth_alg)) return (false); /* Key is optional for BLAKE2 digests. */ if (csp->csp_auth_alg == CRYPTO_BLAKE2B || csp->csp_auth_alg == CRYPTO_BLAKE2S) ; else if (alg_is_keyed_digest(csp->csp_auth_alg)) { if (csp->csp_auth_klen == 0) return (false); } else { if (csp->csp_auth_klen != 0) return (false); } if (csp->csp_auth_mlen != 0) { axf = crypto_auth_hash(csp); if (axf == NULL || csp->csp_auth_mlen > axf->hashsize) return (false); if (csp->csp_auth_alg == CRYPTO_AES_CCM_CBC_MAC && !ccm_tag_length_valid(csp->csp_auth_mlen)) return (false); } break; case CSP_MODE_AEAD: if (!alg_is_aead(csp->csp_cipher_alg)) return (false); if (csp->csp_cipher_klen == 0) return (false); if (csp->csp_ivlen == 0 || csp->csp_ivlen >= EALG_MAX_BLOCK_LEN) return (false); if (csp->csp_auth_alg != 0 || csp->csp_auth_klen != 0) return (false); switch (csp->csp_cipher_alg) { case CRYPTO_AES_CCM_16: if (csp->csp_auth_mlen != 0 && !ccm_tag_length_valid(csp->csp_auth_mlen)) return (false); if (csp->csp_ivlen < 7 || csp->csp_ivlen > 13) return (false); break; case CRYPTO_AES_NIST_GCM_16: if (csp->csp_auth_mlen > AES_GMAC_HASH_LEN) return (false); if (csp->csp_ivlen != AES_GCM_IV_LEN) return (false); break; case CRYPTO_CHACHA20_POLY1305: if (csp->csp_ivlen != 8 && csp->csp_ivlen != 12) return (false); if (csp->csp_auth_mlen > POLY1305_HASH_LEN) return (false); break; case CRYPTO_XCHACHA20_POLY1305: if (csp->csp_ivlen != XCHACHA20_POLY1305_IV_LEN) return (false); if (csp->csp_auth_mlen > POLY1305_HASH_LEN) return (false); break; } break; case CSP_MODE_ETA: if (!alg_is_cipher(csp->csp_cipher_alg)) return (false); if (csp->csp_cipher_alg != CRYPTO_NULL_CBC) { if (csp->csp_cipher_klen == 0) return (false); if (csp->csp_ivlen == 0) return (false); } if (csp->csp_ivlen >= EALG_MAX_BLOCK_LEN) return (false); if (!alg_is_digest(csp->csp_auth_alg)) return (false); /* Key is optional for BLAKE2 digests. 
*/ if (csp->csp_auth_alg == CRYPTO_BLAKE2B || csp->csp_auth_alg == CRYPTO_BLAKE2S) ; else if (alg_is_keyed_digest(csp->csp_auth_alg)) { if (csp->csp_auth_klen == 0) return (false); } else { if (csp->csp_auth_klen != 0) return (false); } if (csp->csp_auth_mlen != 0) { axf = crypto_auth_hash(csp); if (axf == NULL || csp->csp_auth_mlen > axf->hashsize) return (false); } break; default: return (false); } return (true); } /* * Delete a session after it has been detached from its driver. */ static void crypto_deletesession(crypto_session_t cses) { struct cryptocap *cap; cap = cses->cap; zfree(cses, M_CRYPTO_DATA); CRYPTO_DRIVER_LOCK(); cap->cc_sessions--; if (cap->cc_sessions == 0 && cap->cc_flags & CRYPTOCAP_F_CLEANUP) wakeup(cap); CRYPTO_DRIVER_UNLOCK(); cap_rele(cap); } /* * Create a new session. The crid argument specifies a crypto * driver to use or constraints on a driver to select (hardware * only, software only, either). Whatever driver is selected * must be capable of the requested crypto algorithms. */ int crypto_newsession(crypto_session_t *cses, const struct crypto_session_params *csp, int crid) { static uint64_t sessid = 0; crypto_session_t res; struct cryptocap *cap; int err; if (!check_csp(csp)) return (EINVAL); res = NULL; CRYPTO_DRIVER_LOCK(); if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) { /* * Use specified driver; verify it is capable. */ cap = crypto_checkdriver(crid); if (cap != NULL && CRYPTODEV_PROBESESSION(cap->cc_dev, csp) > 0) cap = NULL; } else { /* * No requested driver; select based on crid flags. */ cap = crypto_select_driver(csp, crid); } if (cap == NULL) { CRYPTO_DRIVER_UNLOCK(); CRYPTDEB("no driver"); return (EOPNOTSUPP); } cap_ref(cap); cap->cc_sessions++; CRYPTO_DRIVER_UNLOCK(); /* Allocate a single block for the generic session and driver softc. */ res = malloc(sizeof(*res) + cap->cc_session_size, M_CRYPTO_DATA, M_WAITOK | M_ZERO); res->cap = cap; res->csp = *csp; res->id = atomic_fetchadd_64(&sessid, 1); /* Call the driver initialization routine. */ err = CRYPTODEV_NEWSESSION(cap->cc_dev, res, csp); if (err != 0) { CRYPTDEB("dev newsession failed: %d", err); crypto_deletesession(res); return (err); } *cses = res; return (0); } /* * Delete an existing session (or a reserved session on an unregistered * driver). */ void crypto_freesession(crypto_session_t cses) { struct cryptocap *cap; if (cses == NULL) return; cap = cses->cap; /* Call the driver cleanup routine, if available. */ CRYPTODEV_FREESESSION(cap->cc_dev, cses); crypto_deletesession(cses); } /* * Return a new driver id. Registers a driver with the system so that * it can be probed by subsequent sessions. */ int32_t crypto_get_driverid(device_t dev, size_t sessionsize, int flags) { struct cryptocap *cap, **newdrv; int i; if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) { device_printf(dev, "no flags specified when registering driver\n"); return -1; } cap = malloc(sizeof(*cap), M_CRYPTO_DATA, M_WAITOK | M_ZERO); cap->cc_dev = dev; cap->cc_session_size = sessionsize; cap->cc_flags = flags; refcount_init(&cap->cc_refs, 1); CRYPTO_DRIVER_LOCK(); for (;;) { for (i = 0; i < crypto_drivers_size; i++) { if (crypto_drivers[i] == NULL) break; } if (i < crypto_drivers_size) break; /* Out of entries, allocate some more. 
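 * The driver table lock is dropped across the M_WAITOK allocation below,
 * since sleeping for memory while holding a mutex is not permitted; the
 * slot search is then retried from the top once the lock is reacquired.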
*/ if (2 * crypto_drivers_size <= crypto_drivers_size) { CRYPTO_DRIVER_UNLOCK(); printf("crypto: driver count wraparound!\n"); cap_rele(cap); return (-1); } CRYPTO_DRIVER_UNLOCK(); newdrv = malloc(2 * crypto_drivers_size * sizeof(*crypto_drivers), M_CRYPTO_DATA, M_WAITOK | M_ZERO); CRYPTO_DRIVER_LOCK(); memcpy(newdrv, crypto_drivers, crypto_drivers_size * sizeof(*crypto_drivers)); crypto_drivers_size *= 2; free(crypto_drivers, M_CRYPTO_DATA); crypto_drivers = newdrv; } cap->cc_hid = i; crypto_drivers[i] = cap; CRYPTO_DRIVER_UNLOCK(); if (bootverbose) printf("crypto: assign %s driver id %u, flags 0x%x\n", device_get_nameunit(dev), i, flags); return i; } /* * Lookup a driver by name. We match against the full device * name and unit, and against just the name. The latter gives * us a simple widlcarding by device name. On success return the * driver/hardware identifier; otherwise return -1. */ int crypto_find_driver(const char *match) { struct cryptocap *cap; int i, len = strlen(match); CRYPTO_DRIVER_LOCK(); for (i = 0; i < crypto_drivers_size; i++) { if (crypto_drivers[i] == NULL) continue; cap = crypto_drivers[i]; if (strncmp(match, device_get_nameunit(cap->cc_dev), len) == 0 || strncmp(match, device_get_name(cap->cc_dev), len) == 0) { CRYPTO_DRIVER_UNLOCK(); return (i); } } CRYPTO_DRIVER_UNLOCK(); return (-1); } /* * Return the device_t for the specified driver or NULL * if the driver identifier is invalid. */ device_t crypto_find_device_byhid(int hid) { struct cryptocap *cap; device_t dev; dev = NULL; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(hid); if (cap != NULL) dev = cap->cc_dev; CRYPTO_DRIVER_UNLOCK(); return (dev); } /* * Return the device/driver capabilities. */ int crypto_getcaps(int hid) { struct cryptocap *cap; int flags; flags = 0; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(hid); if (cap != NULL) flags = cap->cc_flags; CRYPTO_DRIVER_UNLOCK(); return (flags); } /* * Unregister all algorithms associated with a crypto driver. * If there are pending sessions using it, leave enough information * around so that subsequent calls using those sessions will * correctly detect the driver has been unregistered and reroute * requests. */ int crypto_unregister_all(uint32_t driverid) { struct cryptocap *cap; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(driverid); if (cap == NULL) { CRYPTO_DRIVER_UNLOCK(); return (EINVAL); } cap->cc_flags |= CRYPTOCAP_F_CLEANUP; crypto_drivers[driverid] = NULL; /* * XXX: This doesn't do anything to kick sessions that * have no pending operations. */ while (cap->cc_sessions != 0) mtx_sleep(cap, &crypto_drivers_mtx, 0, "cryunreg", 0); CRYPTO_DRIVER_UNLOCK(); cap_rele(cap); return (0); } /* * Clear blockage on a driver. The what parameter indicates whether * the driver is now ready for cryptop's and/or cryptokop's. 
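 *
 * For illustration only (the softc fields are hypothetical, not part of
 * this file): a driver whose process method returned ERESTART typically
 * clears its own blocked flag and calls this from its completion path
 * once resources are available again:
 *
 *	if (sc->sc_needwakeup) {
 *		sc->sc_needwakeup = 0;
 *		crypto_unblock(sc->sc_cid, CRYPTO_SYMQ);
 *	}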
*/ int crypto_unblock(uint32_t driverid, int what) { struct cryptocap *cap; int err; CRYPTO_Q_LOCK(); cap = crypto_checkdriver(driverid); if (cap != NULL) { if (what & CRYPTO_SYMQ) cap->cc_qblocked = 0; if (crp_sleep) wakeup_one(&crp_q); err = 0; } else err = EINVAL; CRYPTO_Q_UNLOCK(); return err; } size_t crypto_buffer_len(struct crypto_buffer *cb) { switch (cb->cb_type) { case CRYPTO_BUF_CONTIG: return (cb->cb_buf_len); case CRYPTO_BUF_MBUF: if (cb->cb_mbuf->m_flags & M_PKTHDR) return (cb->cb_mbuf->m_pkthdr.len); return (m_length(cb->cb_mbuf, NULL)); case CRYPTO_BUF_SINGLE_MBUF: return (cb->cb_mbuf->m_len); case CRYPTO_BUF_VMPAGE: return (cb->cb_vm_page_len); case CRYPTO_BUF_UIO: return (cb->cb_uio->uio_resid); default: return (0); } } #ifdef INVARIANTS /* Various sanity checks on crypto requests. */ static void cb_sanity(struct crypto_buffer *cb, const char *name) { KASSERT(cb->cb_type > CRYPTO_BUF_NONE && cb->cb_type <= CRYPTO_BUF_LAST, ("incoming crp with invalid %s buffer type", name)); switch (cb->cb_type) { case CRYPTO_BUF_CONTIG: KASSERT(cb->cb_buf_len >= 0, ("incoming crp with -ve %s buffer length", name)); break; case CRYPTO_BUF_VMPAGE: KASSERT(CRYPTO_HAS_VMPAGE, ("incoming crp uses dmap on supported arch")); KASSERT(cb->cb_vm_page_len >= 0, ("incoming crp with -ve %s buffer length", name)); KASSERT(cb->cb_vm_page_offset >= 0, ("incoming crp with -ve %s buffer offset", name)); KASSERT(cb->cb_vm_page_offset < PAGE_SIZE, ("incoming crp with %s buffer offset greater than page size" , name)); break; default: break; } } static void crp_sanity(struct cryptop *crp) { struct crypto_session_params *csp; struct crypto_buffer *out; size_t ilen, len, olen; KASSERT(crp->crp_session != NULL, ("incoming crp without a session")); KASSERT(crp->crp_obuf.cb_type >= CRYPTO_BUF_NONE && crp->crp_obuf.cb_type <= CRYPTO_BUF_LAST, ("incoming crp with invalid output buffer type")); KASSERT(crp->crp_etype == 0, ("incoming crp with error")); KASSERT(!(crp->crp_flags & CRYPTO_F_DONE), ("incoming crp already done")); csp = &crp->crp_session->csp; cb_sanity(&crp->crp_buf, "input"); ilen = crypto_buffer_len(&crp->crp_buf); olen = ilen; out = NULL; if (csp->csp_flags & CSP_F_SEPARATE_OUTPUT) { if (crp->crp_obuf.cb_type != CRYPTO_BUF_NONE) { cb_sanity(&crp->crp_obuf, "output"); out = &crp->crp_obuf; olen = crypto_buffer_len(out); } } else KASSERT(crp->crp_obuf.cb_type == CRYPTO_BUF_NONE, ("incoming crp with separate output buffer " "but no session support")); switch (csp->csp_mode) { case CSP_MODE_COMPRESS: KASSERT(crp->crp_op == CRYPTO_OP_COMPRESS || crp->crp_op == CRYPTO_OP_DECOMPRESS, ("invalid compression op %x", crp->crp_op)); break; case CSP_MODE_CIPHER: KASSERT(crp->crp_op == CRYPTO_OP_ENCRYPT || crp->crp_op == CRYPTO_OP_DECRYPT, ("invalid cipher op %x", crp->crp_op)); break; case CSP_MODE_DIGEST: KASSERT(crp->crp_op == CRYPTO_OP_COMPUTE_DIGEST || crp->crp_op == CRYPTO_OP_VERIFY_DIGEST, ("invalid digest op %x", crp->crp_op)); break; case CSP_MODE_AEAD: KASSERT(crp->crp_op == (CRYPTO_OP_ENCRYPT | CRYPTO_OP_COMPUTE_DIGEST) || crp->crp_op == (CRYPTO_OP_DECRYPT | CRYPTO_OP_VERIFY_DIGEST), ("invalid AEAD op %x", crp->crp_op)); KASSERT(crp->crp_flags & CRYPTO_F_IV_SEPARATE, ("AEAD without a separate IV")); break; case CSP_MODE_ETA: KASSERT(crp->crp_op == (CRYPTO_OP_ENCRYPT | CRYPTO_OP_COMPUTE_DIGEST) || crp->crp_op == (CRYPTO_OP_DECRYPT | CRYPTO_OP_VERIFY_DIGEST), ("invalid ETA op %x", crp->crp_op)); break; } if (csp->csp_mode == CSP_MODE_AEAD || csp->csp_mode == CSP_MODE_ETA) { if (crp->crp_aad == NULL) { 
KASSERT(crp->crp_aad_start == 0 || crp->crp_aad_start < ilen, ("invalid AAD start")); KASSERT(crp->crp_aad_length != 0 || crp->crp_aad_start == 0, ("AAD with zero length and non-zero start")); KASSERT(crp->crp_aad_length == 0 || crp->crp_aad_start + crp->crp_aad_length <= ilen, ("AAD outside input length")); } else { KASSERT(csp->csp_flags & CSP_F_SEPARATE_AAD, ("session doesn't support separate AAD buffer")); KASSERT(crp->crp_aad_start == 0, ("separate AAD buffer with non-zero AAD start")); KASSERT(crp->crp_aad_length != 0, ("separate AAD buffer with zero length")); } } else { KASSERT(crp->crp_aad == NULL && crp->crp_aad_start == 0 && crp->crp_aad_length == 0, ("AAD region in request not supporting AAD")); } if (csp->csp_ivlen == 0) { KASSERT((crp->crp_flags & CRYPTO_F_IV_SEPARATE) == 0, ("IV_SEPARATE set when IV isn't used")); KASSERT(crp->crp_iv_start == 0, ("crp_iv_start set when IV isn't used")); } else if (crp->crp_flags & CRYPTO_F_IV_SEPARATE) { KASSERT(crp->crp_iv_start == 0, ("IV_SEPARATE used with non-zero IV start")); } else { KASSERT(crp->crp_iv_start < ilen, ("invalid IV start")); KASSERT(crp->crp_iv_start + csp->csp_ivlen <= ilen, ("IV outside buffer length")); } /* XXX: payload_start of 0 should always be < ilen? */ KASSERT(crp->crp_payload_start == 0 || crp->crp_payload_start < ilen, ("invalid payload start")); KASSERT(crp->crp_payload_start + crp->crp_payload_length <= ilen, ("payload outside input buffer")); if (out == NULL) { KASSERT(crp->crp_payload_output_start == 0, ("payload output start non-zero without output buffer")); } else if (csp->csp_mode == CSP_MODE_DIGEST) { KASSERT(!(crp->crp_op & CRYPTO_OP_VERIFY_DIGEST), ("digest verify with separate output buffer")); KASSERT(crp->crp_payload_output_start == 0, ("digest operation with non-zero payload output start")); } else { KASSERT(crp->crp_payload_output_start == 0 || crp->crp_payload_output_start < olen, ("invalid payload output start")); KASSERT(crp->crp_payload_output_start + crp->crp_payload_length <= olen, ("payload outside output buffer")); } if (csp->csp_mode == CSP_MODE_DIGEST || csp->csp_mode == CSP_MODE_AEAD || csp->csp_mode == CSP_MODE_ETA) { if (crp->crp_op & CRYPTO_OP_VERIFY_DIGEST) len = ilen; else len = olen; KASSERT(crp->crp_digest_start == 0 || crp->crp_digest_start < len, ("invalid digest start")); /* XXX: For the mlen == 0 case this check isn't perfect. */ KASSERT(crp->crp_digest_start + csp->csp_auth_mlen <= len, ("digest outside buffer")); } else { KASSERT(crp->crp_digest_start == 0, ("non-zero digest start for request without a digest")); } if (csp->csp_cipher_klen != 0) KASSERT(csp->csp_cipher_key != NULL || crp->crp_cipher_key != NULL, ("cipher request without a key")); if (csp->csp_auth_klen != 0) KASSERT(csp->csp_auth_key != NULL || crp->crp_auth_key != NULL, ("auth request without a key")); KASSERT(crp->crp_callback != NULL, ("incoming crp without callback")); } #endif static int crypto_dispatch_one(struct cryptop *crp, int hint) { struct cryptocap *cap; int result; #ifdef INVARIANTS crp_sanity(crp); #endif CRYPTOSTAT_INC(cs_ops); crp->crp_retw_id = crp->crp_session->id % crypto_workers_num; /* * Caller marked the request to be processed immediately; dispatch it * directly to the driver unless the driver is currently blocked, in * which case it is queued for deferred dispatch. 
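 *
 * As a rough sketch of the consumer side (illustrative only; the mbuf
 * 'm' and callback 'cb' are hypothetical), a request normally reaches
 * this point via something like:
 *
 *	crp = crypto_getreq(cses, M_WAITOK);
 *	crypto_use_mbuf(crp, m);
 *	crp->crp_op = CRYPTO_OP_ENCRYPT;
 *	crp->crp_payload_length = m->m_pkthdr.len;
 *	crp->crp_callback = cb;
 *	error = crypto_dispatch(crp);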
*/ cap = crp->crp_session->cap; if (!atomic_load_int(&cap->cc_qblocked)) { result = crypto_invoke(cap, crp, hint); if (result != ERESTART) return (result); /* * The driver ran out of resources, put the request on the * queue. */ } crypto_batch_enqueue(crp); return (0); } int crypto_dispatch(struct cryptop *crp) { return (crypto_dispatch_one(crp, 0)); } int crypto_dispatch_async(struct cryptop *crp, int flags) { struct crypto_ret_worker *ret_worker; if (!CRYPTO_SESS_SYNC(crp->crp_session)) { /* * The driver issues completions asynchonously, don't bother * deferring dispatch to a worker thread. */ return (crypto_dispatch(crp)); } #ifdef INVARIANTS crp_sanity(crp); #endif CRYPTOSTAT_INC(cs_ops); crp->crp_retw_id = crp->crp_session->id % crypto_workers_num; if ((flags & CRYPTO_ASYNC_ORDERED) != 0) { crp->crp_flags |= CRYPTO_F_ASYNC_ORDERED; ret_worker = CRYPTO_RETW(crp->crp_retw_id); CRYPTO_RETW_LOCK(ret_worker); crp->crp_seq = ret_worker->reorder_ops++; CRYPTO_RETW_UNLOCK(ret_worker); } TASK_INIT(&crp->crp_task, 0, crypto_task_invoke, crp); taskqueue_enqueue(crypto_tq, &crp->crp_task); return (0); } void crypto_dispatch_batch(struct cryptopq *crpq, int flags) { struct cryptop *crp; int hint; while ((crp = TAILQ_FIRST(crpq)) != NULL) { hint = TAILQ_NEXT(crp, crp_next) != NULL ? CRYPTO_HINT_MORE : 0; TAILQ_REMOVE(crpq, crp, crp_next); if (crypto_dispatch_one(crp, hint) != 0) crypto_batch_enqueue(crp); } } static void crypto_batch_enqueue(struct cryptop *crp) { CRYPTO_Q_LOCK(); TAILQ_INSERT_TAIL(&crp_q, crp, crp_next); if (crp_sleep) wakeup_one(&crp_q); CRYPTO_Q_UNLOCK(); } static void crypto_task_invoke(void *ctx, int pending) { struct cryptocap *cap; struct cryptop *crp; int result; crp = (struct cryptop *)ctx; cap = crp->crp_session->cap; result = crypto_invoke(cap, crp, 0); if (result == ERESTART) crypto_batch_enqueue(crp); } /* * Dispatch a crypto request to the appropriate crypto devices. */ static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint) { int error; KASSERT(crp != NULL, ("%s: crp == NULL", __func__)); KASSERT(crp->crp_callback != NULL, ("%s: crp->crp_callback == NULL", __func__)); KASSERT(crp->crp_session != NULL, ("%s: crp->crp_session == NULL", __func__)); if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) { struct crypto_session_params csp; crypto_session_t nses; /* * Driver has unregistered; migrate the session and return * an error to the caller so they'll resubmit the op. * * XXX: What if there are more already queued requests for this * session? * * XXX: Real solution is to make sessions refcounted * and force callers to hold a reference when * assigning to crp_session. Could maybe change * crypto_getreq to accept a session pointer to make * that work. Alternatively, we could abandon the * notion of rewriting crp_session in requests forcing * the caller to deal with allocating a new session. * Perhaps provide a method to allow a crp's session to * be swapped that callers could use. */ csp = crp->crp_session->csp; crypto_freesession(crp->crp_session); /* * XXX: Key pointers may no longer be valid. If we * really want to support this we need to define the * KPI such that 'csp' is required to be valid for the * duration of a session by the caller perhaps. * * XXX: If the keys have been changed this will reuse * the old keys. This probably suggests making * rekeying more explicit and updating the key * pointers in 'csp' when the keys change. 
*/ if (crypto_newsession(&nses, &csp, CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0) crp->crp_session = nses; crp->crp_etype = EAGAIN; crypto_done(crp); error = 0; } else { /* * Invoke the driver to process the request. Errors are * signaled by setting crp_etype before invoking the completion * callback. */ error = CRYPTODEV_PROCESS(cap->cc_dev, crp, hint); KASSERT(error == 0 || error == ERESTART, ("%s: invalid error %d from CRYPTODEV_PROCESS", __func__, error)); } return (error); } void crypto_destroyreq(struct cryptop *crp) { #ifdef DIAGNOSTIC { struct cryptop *crp2; struct crypto_ret_worker *ret_worker; if (!crypto_destroyreq_check) return; CRYPTO_Q_LOCK(); TAILQ_FOREACH(crp2, &crp_q, crp_next) { KASSERT(crp2 != crp, ("Freeing cryptop from the crypto queue (%p).", crp)); } CRYPTO_Q_UNLOCK(); FOREACH_CRYPTO_RETW(ret_worker) { CRYPTO_RETW_LOCK(ret_worker); TAILQ_FOREACH(crp2, &ret_worker->crp_ret_q, crp_next) { KASSERT(crp2 != crp, ("Freeing cryptop from the return queue (%p).", crp)); } CRYPTO_RETW_UNLOCK(ret_worker); } } #endif } void crypto_freereq(struct cryptop *crp) { if (crp == NULL) return; crypto_destroyreq(crp); uma_zfree(cryptop_zone, crp); } static void _crypto_initreq(struct cryptop *crp, crypto_session_t cses) { crp->crp_session = cses; } void crypto_initreq(struct cryptop *crp, crypto_session_t cses) { memset(crp, 0, sizeof(*crp)); _crypto_initreq(crp, cses); } struct cryptop * crypto_getreq(crypto_session_t cses, int how) { struct cryptop *crp; MPASS(how == M_WAITOK || how == M_NOWAIT); crp = uma_zalloc(cryptop_zone, how | M_ZERO); if (crp != NULL) _crypto_initreq(crp, cses); return (crp); } /* * Clone a crypto request, but associate it with the specified session * rather than inheriting the session from the original request. The * fields describing the request buffers are copied, but not the * opaque field or callback function. */ struct cryptop * crypto_clonereq(struct cryptop *crp, crypto_session_t cses, int how) { struct cryptop *new; MPASS((crp->crp_flags & CRYPTO_F_DONE) == 0); new = crypto_getreq(cses, how); if (new == NULL) return (NULL); memcpy(&new->crp_startcopy, &crp->crp_startcopy, __rangeof(struct cryptop, crp_startcopy, crp_endcopy)); return (new); } /* * Invoke the callback on behalf of the driver. */ void crypto_done(struct cryptop *crp) { KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0, ("crypto_done: op already done, flags 0x%x", crp->crp_flags)); crp->crp_flags |= CRYPTO_F_DONE; if (crp->crp_etype != 0) CRYPTOSTAT_INC(cs_errs); /* * CBIMM means unconditionally do the callback immediately; * CBIFSYNC means do the callback immediately only if the * operation was done synchronously. Both are used to avoid * doing extraneous context switches; the latter is mostly * used with the software crypto driver. */ if ((crp->crp_flags & CRYPTO_F_ASYNC_ORDERED) == 0 && ((crp->crp_flags & CRYPTO_F_CBIMM) != 0 || ((crp->crp_flags & CRYPTO_F_CBIFSYNC) != 0 && CRYPTO_SESS_SYNC(crp->crp_session)))) { /* * Do the callback directly. This is ok when the * callback routine does very little (e.g. the * /dev/crypto callback method just does a wakeup). */ crp->crp_callback(crp); } else { struct crypto_ret_worker *ret_worker; bool wake; ret_worker = CRYPTO_RETW(crp->crp_retw_id); /* * Normal case; queue the callback for the thread. 
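 *
 * A typical callback (sketch; the function name is hypothetical) just
 * retries on the EAGAIN that crypto_invoke() sets after migrating a
 * session away from an unregistering driver:
 *
 *	static int
 *	foo_crypto_cb(struct cryptop *crp)
 *	{
 *		if (crp->crp_etype == EAGAIN) {
 *			crp->crp_etype = 0;
 *			crp->crp_flags &= ~CRYPTO_F_DONE;
 *			return (crypto_dispatch(crp));
 *		}
 *		...
 *	}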
*/ CRYPTO_RETW_LOCK(ret_worker); if ((crp->crp_flags & CRYPTO_F_ASYNC_ORDERED) != 0) { struct cryptop *tmp; TAILQ_FOREACH_REVERSE(tmp, &ret_worker->crp_ordered_ret_q, cryptop_q, crp_next) { if (CRYPTO_SEQ_GT(crp->crp_seq, tmp->crp_seq)) { TAILQ_INSERT_AFTER( &ret_worker->crp_ordered_ret_q, tmp, crp, crp_next); break; } } if (tmp == NULL) { TAILQ_INSERT_HEAD( &ret_worker->crp_ordered_ret_q, crp, crp_next); } wake = crp->crp_seq == ret_worker->reorder_cur_seq; } else { wake = TAILQ_EMPTY(&ret_worker->crp_ret_q); TAILQ_INSERT_TAIL(&ret_worker->crp_ret_q, crp, crp_next); } if (wake) wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */ CRYPTO_RETW_UNLOCK(ret_worker); } } /* * Terminate a thread at module unload. The process that * initiated this is waiting for us to signal that we're gone; * wake it up and exit. We use the driver table lock to insure * we don't do the wakeup before they're waiting. There is no * race here because the waiter sleeps on the proc lock for the * thread so it gets notified at the right time because of an * extra wakeup that's done in exit1(). */ static void crypto_finis(void *chan) { CRYPTO_DRIVER_LOCK(); wakeup_one(chan); CRYPTO_DRIVER_UNLOCK(); kthread_exit(); } /* * Crypto thread, dispatches crypto requests. */ static void crypto_dispatch_thread(void *arg __unused) { struct cryptop *crp, *submit; struct cryptocap *cap; int result, hint; #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) fpu_kern_thread(FPU_KERN_NORMAL); #endif CRYPTO_Q_LOCK(); for (;;) { /* * Find the first element in the queue that can be * processed and look-ahead to see if multiple ops * are ready for the same driver. */ submit = NULL; hint = 0; TAILQ_FOREACH(crp, &crp_q, crp_next) { cap = crp->crp_session->cap; /* * Driver cannot disappeared when there is an active * session. */ KASSERT(cap != NULL, ("%s:%u Driver disappeared.", __func__, __LINE__)); if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) { /* Op needs to be migrated, process it. */ if (submit == NULL) submit = crp; break; } if (!cap->cc_qblocked) { if (submit != NULL) { /* * We stop on finding another op, * regardless whether its for the same * driver or not. We could keep * searching the queue but it might be * better to just use a per-driver * queue instead. */ if (submit->crp_session->cap == cap) hint = CRYPTO_HINT_MORE; } else { submit = crp; } break; } } if (submit != NULL) { TAILQ_REMOVE(&crp_q, submit, crp_next); cap = submit->crp_session->cap; KASSERT(cap != NULL, ("%s:%u Driver disappeared.", __func__, __LINE__)); CRYPTO_Q_UNLOCK(); result = crypto_invoke(cap, submit, hint); CRYPTO_Q_LOCK(); if (result == ERESTART) { /* * The driver ran out of resources, mark the * driver ``blocked'' for cryptop's and put * the request back in the queue. It would * best to put the request back where we got * it but that's hard so for now we put it * at the front. This should be ok; putting * it at the end does not work. */ cap->cc_qblocked = 1; TAILQ_INSERT_HEAD(&crp_q, submit, crp_next); CRYPTOSTAT_INC(cs_blocks); } } else { /* * Nothing more to be processed. Sleep until we're * woken because there are more ops to process. * This happens either by submission or by a driver * becoming unblocked and notifying us through * crypto_unblock. Note that when we wakeup we * start processing each queue again from the * front. It's not clear that it's important to * preserve this ordering since ops may finish * out of order if dispatched to different devices * and some become blocked while others do not. 
*/ crp_sleep = 1; msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0); crp_sleep = 0; if (cryptotd == NULL) break; CRYPTOSTAT_INC(cs_intrs); } } CRYPTO_Q_UNLOCK(); crypto_finis(&crp_q); } /* * Crypto returns thread, does callbacks for processed crypto requests. * Callbacks are done here, rather than in the crypto drivers, because * callbacks typically are expensive and would slow interrupt handling. */ static void crypto_ret_thread(void *arg) { struct crypto_ret_worker *ret_worker = arg; struct cryptop *crpt; CRYPTO_RETW_LOCK(ret_worker); for (;;) { /* Harvest return q's for completed ops */ crpt = TAILQ_FIRST(&ret_worker->crp_ordered_ret_q); if (crpt != NULL) { if (crpt->crp_seq == ret_worker->reorder_cur_seq) { TAILQ_REMOVE(&ret_worker->crp_ordered_ret_q, crpt, crp_next); ret_worker->reorder_cur_seq++; } else { crpt = NULL; } } if (crpt == NULL) { crpt = TAILQ_FIRST(&ret_worker->crp_ret_q); if (crpt != NULL) TAILQ_REMOVE(&ret_worker->crp_ret_q, crpt, crp_next); } if (crpt != NULL) { CRYPTO_RETW_UNLOCK(ret_worker); /* * Run callbacks unlocked. */ if (crpt != NULL) crpt->crp_callback(crpt); CRYPTO_RETW_LOCK(ret_worker); } else { /* * Nothing more to be processed. Sleep until we're * woken because there are more returns to process. */ msleep(&ret_worker->crp_ret_q, &ret_worker->crypto_ret_mtx, PWAIT, "crypto_ret_wait", 0); if (ret_worker->td == NULL) break; CRYPTOSTAT_INC(cs_rets); } } CRYPTO_RETW_UNLOCK(ret_worker); crypto_finis(&ret_worker->crp_ret_q); } #ifdef DDB static void db_show_drivers(void) { int hid; db_printf("%12s %4s %8s %2s\n" , "Device" , "Ses" , "Flags" , "QB" ); for (hid = 0; hid < crypto_drivers_size; hid++) { const struct cryptocap *cap = crypto_drivers[hid]; if (cap == NULL) continue; db_printf("%-12s %4u %08x %2u\n" , device_get_nameunit(cap->cc_dev) , cap->cc_sessions , cap->cc_flags , cap->cc_qblocked ); } } -DB_SHOW_COMMAND(crypto, db_show_crypto) +DB_SHOW_COMMAND_FLAGS(crypto, db_show_crypto, DB_CMD_MEMSAFE) { struct cryptop *crp; struct crypto_ret_worker *ret_worker; db_show_drivers(); db_printf("\n"); db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n", "HID", "Caps", "Ilen", "Olen", "Etype", "Flags", "Device", "Callback"); TAILQ_FOREACH(crp, &crp_q, crp_next) { db_printf("%4u %08x %4u %4u %04x %8p %8p\n" , crp->crp_session->cap->cc_hid , (int) crypto_ses2caps(crp->crp_session) , crp->crp_olen , crp->crp_etype , crp->crp_flags , device_get_nameunit(crp->crp_session->cap->cc_dev) , crp->crp_callback ); } FOREACH_CRYPTO_RETW(ret_worker) { db_printf("\n%8s %4s %4s %4s %8s\n", "ret_worker", "HID", "Etype", "Flags", "Callback"); if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) { TAILQ_FOREACH(crp, &ret_worker->crp_ret_q, crp_next) { db_printf("%8td %4u %4u %04x %8p\n" , CRYPTO_RETW_ID(ret_worker) , crp->crp_session->cap->cc_hid , crp->crp_etype , crp->crp_flags , crp->crp_callback ); } } } } #endif int crypto_modevent(module_t mod, int type, void *unused); /* * Initialization code, both for static and dynamic loading. * Note this is not invoked with the usual MODULE_DECLARE * mechanism but instead is listed as a dependency by the * cryptosoft driver. This guarantees proper ordering of * calls on module load/unload. 
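 *
 * For example (hypothetical module name), a crypto driver pulls this
 * code in the same way cryptosoft does, by declaring:
 *
 *	MODULE_DEPEND(foodrv, crypto, 1, 1, 1);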
*/ int crypto_modevent(module_t mod, int type, void *unused) { int error = EINVAL; switch (type) { case MOD_LOAD: error = crypto_init(); if (error == 0 && bootverbose) printf("crypto: \n"); break; case MOD_UNLOAD: /*XXX disallow if active sessions */ error = 0; crypto_destroy(); return 0; } return error; } MODULE_VERSION(crypto, 1); MODULE_DEPEND(crypto, zlib, 1, 1, 1); diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index ba8e60e66f1f..1afcb45c4d2b 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -1,5919 +1,5919 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2019 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #include #ifdef INVARIANTS #define UMA_ALWAYS_CTORDTOR 1 #else #define UMA_ALWAYS_CTORDTOR 0 #endif /* * This is the zone and keg from which all zones are spawned. */ static uma_zone_t kegs; static uma_zone_t zones; /* * On INVARIANTS builds, the slab contains a second bitset of the same size, * "dbg_bits", which is laid out immediately after us_free. 
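 *
 * (Concretely, this is why SLABZONE_SIZE() below multiplies
 * BITSET_SIZE() by SLAB_BITSETS: on INVARIANTS kernels each offpage
 * slab header carries two bitsets, us_free plus the dbg_bits shadow
 * used by uma_dbg_alloc()/uma_dbg_free() to catch duplicate frees.)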
*/ #ifdef INVARIANTS #define SLAB_BITSETS 2 #else #define SLAB_BITSETS 1 #endif /* * These are the two zones from which all offpage uma_slab_ts are allocated. * * One zone is for slab headers that can represent a larger number of items, * making the slabs themselves more efficient, and the other zone is for * headers that are smaller and represent fewer items, making the headers more * efficient. */ #define SLABZONE_SIZE(setsize) \ (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS) #define SLABZONE0_SETSIZE (PAGE_SIZE / 16) #define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE #define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE) #define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE) static uma_zone_t slabzones[2]; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* * Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of * zones. */ static struct rwlock_padalign __exclusive_cache_line uma_rwlock; static struct sx uma_reclaim_lock; /* * First available virual address for boot time allocations. */ static vm_offset_t bootstart; static vm_offset_t bootmem; /* * kmem soft limit, initialized by uma_set_limit(). Ensure that early * allocations don't trigger a wakeup of the reclaim thread. */ unsigned long uma_kmem_limit = LONG_MAX; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, "UMA kernel memory soft limit"); unsigned long uma_kmem_total; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, "UMA kernel memory usage"); /* Is the VM done starting up? */ static enum { BOOT_COLD, BOOT_KVA, BOOT_PCPU, BOOT_RUNNING, BOOT_SHUTDOWN, } booted = BOOT_COLD; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct timeout_task uma_timeout_task; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; const char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) struct uma_bucket_zone bucket_zones[] = { /* Literal bucket sizes. 
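 *
 * (Worked example, assuming 8-byte pointers and a 16-byte struct
 * uma_bucket header: the BUCKET_SIZE(32) entry below evaluates to
 * (8 * 32 - 16) / 8 = 30, i.e. 30 item pointers packed into a
 * 256-byte bucket allocation.)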
*/ { NULL, "2 Bucket", 2, 4096 }, { NULL, "4 Bucket", 4, 3072 }, { NULL, "8 Bucket", 8, 2048 }, { NULL, "16 Bucket", 16, 1024 }, /* Rounded down power of 2 sizes for efficiency. */ { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_CNT = 0x00000001, SKIP_DTOR = 0x00010000, SKIP_FINI = 0x00020000, }; /* Prototypes.. */ void uma_startup1(vm_offset_t); void uma_startup2(void); static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_reclaim(uma_zone_t zone, bool, int); static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static void keg_drain(uma_keg_t keg, int domain); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip); static int zero_init(void *, int, int); static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws); static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *); static void zone_timeout(uma_zone_t zone, void *); static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *, int); static void uma_shutdown(void); static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static int zone_alloc_limit(uma_zone_t zone, int count, int flags); static void zone_free_limit(uma_zone_t zone, int count); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(int domain); static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); static size_t slab_sizeof(int nitems); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(void *, void **, int, int, int); static void zone_release(void *, void **, int); static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); static bool cache_free(uma_zone_t, uma_cache_t, void *, int); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); 
static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); static uint64_t uma_zone_get_allocs(uma_zone_t zone); static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Memory allocation debugging"); #ifdef INVARIANTS static uint64_t uma_keg_get_allocs(uma_keg_t zone); static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); static bool uma_dbg_kskip(uma_keg_t keg, void *mem); static bool uma_dbg_zskip(uma_zone_t zone, void *mem); static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); static u_int dbg_divisor = 1; SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, "Debug & thrash every this item in memory allocator"); static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; static counter_u64_t uma_skip_cnt = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD, &uma_dbg_cnt, "memory items debugged"); SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD, &uma_skip_cnt, "memory items skipped, not debugged"); #endif SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Universal Memory Allocator"); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); static int multipage_slabs = 1; TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs); SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0, "UMA may choose larger slab sizes for better efficiency"); /* * Select the slab zone for an offpage slab with the given maximum item count. */ static inline uma_zone_t slabzone(int ipers) { return (slabzones[ipers > SLABZONE0_SETSIZE]); } /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { KASSERT(booted >= BOOT_KVA, ("Bucket enable before init")); bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_FIRSTTOUCH); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. 
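 *
 * E.g. a request for 20 entries returns the "32 Bucket" zone, the first
 * element of bucket_zones[] holding at least that many items; requests
 * beyond the largest zone fall back to that largest ("256 Bucket") zone.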
*/ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * Don't allocate buckets early in boot. */ if (__predict_false(booted < BOOT_KVA)) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if (((uintptr_t)udata & UMA_ZONE_VM) != 0) flags |= M_NOVM; ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size)); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = min(ubz->ubz_entries, zone->uz_bucket_size_max); bucket->ub_seq = SMR_SEQ_INVALID; CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p", zone->uz_name, zone, bucket); } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; if (bucket->ub_cnt != 0) bucket_drain(zone, bucket); KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); KASSERT(bucket->ub_seq == SMR_SEQ_INVALID, ("bucket_free: Freeing an SMR bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(int domain) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN, domain); } #ifdef KASAN _Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0, "Base UMA allocation size not a multiple of the KASAN scale factor"); static void kasan_mark_item_valid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz, rsz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = zone->uz_size; rsz = roundup2(sz, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kasan_mark(item, sz, rsz, KASAN_GENERIC_REDZONE); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), sz, rsz, KASAN_GENERIC_REDZONE); } } static void kasan_mark_item_invalid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = roundup2(zone->uz_size, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) 
== 0) { kasan_mark(item, 0, sz, KASAN_UMA_FREED); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), 0, sz, KASAN_UMA_FREED); } } static void kasan_mark_slab_valid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { sz = keg->uk_ppera * PAGE_SIZE; kasan_mark(mem, sz, sz, 0); } } static void kasan_mark_slab_invalid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) sz = keg->uk_ppera * PAGE_SIZE; else sz = keg->uk_pgoff; kasan_mark(mem, 0, sz, KASAN_UMA_FREED); } } #else /* !KASAN */ static void kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused) { } static void kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused) { } #endif /* KASAN */ #ifdef KMSAN static inline void kmsan_mark_item_uninitialized(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & (UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0) { /* * Cache zones should not be instrumented by default, as UMA * does not have enough information to do so correctly. * Consumers can mark items themselves if it makes sense to do * so. * * Items from secondary zones are initialized by the parent * zone and thus cannot safely be marked by UMA. * * malloc zones are handled directly by malloc(9) and friends, * since they can provide more precise origin tracking. */ return; } if (zone->uz_keg->uk_init != NULL) { /* * By definition, initialized items cannot be marked. The * best we can do is mark items from these zones after they * are freed to the keg. */ return; } sz = zone->uz_size; if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kmsan_orig(item, sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(item, sz, KMSAN_STATE_UNINIT); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) { kmsan_orig(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_STATE_INITED); } } } #else /* !KMSAN */ static inline void kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused) { } #endif /* KMSAN */ /* * Acquire the domain lock and record contention. */ static uma_zone_domain_t zone_domain_lock(uma_zone_t zone, int domain) { uma_zone_domain_t zdom; bool lockfail; zdom = ZDOM_GET(zone, domain); lockfail = false; if (ZDOM_OWNED(zdom)) lockfail = true; ZDOM_LOCK(zdom); /* This is unsynchronized. The counter does not need to be precise. */ if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) zone->uz_bucket_size++; return (zdom); } /* * Search for the domain with the least cached items and return it if it * is out of balance with the preferred domain. */ static __noinline int zone_domain_lowest(uma_zone_t zone, int pref) { long least, nitems, prefitems; int domain; int i; prefitems = least = LONG_MAX; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems < least) { domain = i; least = nitems; } if (domain == pref) prefitems = nitems; } if (prefitems < least * 2) return (pref); return (domain); } /* * Search for the domain with the most cached items and return it or the * preferred domain if it has enough to proceed. 
*/ static __noinline int zone_domain_highest(uma_zone_t zone, int pref) { long most, nitems; int domain; int i; if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX) return (pref); most = 0; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems > most) { domain = i; most = nitems; } } return (domain); } /* * Set the maximum imax value. */ static void zone_domain_imax_set(uma_zone_domain_t zdom, int nitems) { long old; old = zdom->uzd_imax; do { if (old >= nitems) return; } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0); /* * We are at new maximum, so do the last WSS update for the old * bimin and prepare to measure next allocation batch. */ if (zdom->uzd_wss < old - zdom->uzd_bimin) zdom->uzd_wss = old - zdom->uzd_bimin; zdom->uzd_bimin = nitems; } /* * Attempt to satisfy an allocation by retrieving a full bucket from one of the * zone's caches. If a bucket is found the zone is not locked on return. */ static uma_bucket_t zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim) { uma_bucket_t bucket; long cnt; int i; bool dtor = false; ZDOM_LOCK_ASSERT(zdom); if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL) return (NULL); /* SMR Buckets can not be re-used until readers expire. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { if (!smr_poll(zone->uz_smr, bucket->ub_seq, false)) return (NULL); bucket->ub_seq = SMR_SEQ_INVALID; dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR; if (STAILQ_NEXT(bucket, ub_link) != NULL) zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq; } STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link); KASSERT(zdom->uzd_nitems >= bucket->ub_cnt, ("%s: item count underflow (%ld, %d)", __func__, zdom->uzd_nitems, bucket->ub_cnt)); KASSERT(bucket->ub_cnt > 0, ("%s: empty bucket in bucket cache", __func__)); zdom->uzd_nitems -= bucket->ub_cnt; if (reclaim) { /* * Shift the bounds of the current WSS interval to avoid * perturbing the estimates. */ cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt); atomic_subtract_long(&zdom->uzd_imax, cnt); zdom->uzd_bimin -= cnt; zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt); if (zdom->uzd_limin >= bucket->ub_cnt) { zdom->uzd_limin -= bucket->ub_cnt; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } } else if (zdom->uzd_bimin > zdom->uzd_nitems) { zdom->uzd_bimin = zdom->uzd_nitems; if (zdom->uzd_imin > zdom->uzd_nitems) zdom->uzd_imin = zdom->uzd_nitems; } ZDOM_UNLOCK(zdom); if (dtor) for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); return (bucket); } /* * Insert a full bucket into the specified cache. The "ws" parameter indicates * whether the bucket's contents should be counted as part of the zone's working * set. The bucket may be freed if it exceeds the bucket limit. */ static void zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata, const bool ws) { uma_zone_domain_t zdom; /* We don't cache empty buckets. This can happen after a reclaim. */ if (bucket->ub_cnt == 0) goto out; zdom = zone_domain_lock(zone, domain); /* * Conditionally set the maximum number of items. */ zdom->uzd_nitems += bucket->ub_cnt; if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) { if (ws) { zone_domain_imax_set(zdom, zdom->uzd_nitems); } else { /* * Shift the bounds of the current WSS interval to * avoid perturbing the estimates. 
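 *
 * (Illustrative numbers: with uzd_imin/uzd_imax currently at 100/150,
 * inserting a 20-item bucket that should not count toward the working
 * set moves every bound up by 20, to 120/170, so the measured batch
 * size imax - bimin is unchanged.)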
*/ atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt); zdom->uzd_imin += bucket->ub_cnt; zdom->uzd_bimin += bucket->ub_cnt; zdom->uzd_limin += bucket->ub_cnt; } if (STAILQ_EMPTY(&zdom->uzd_buckets)) zdom->uzd_seq = bucket->ub_seq; /* * Try to promote reuse of recently used items. For items * protected by SMR, try to defer reuse to minimize polling. */ if (bucket->ub_seq == SMR_SEQ_INVALID) STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); ZDOM_UNLOCK(zdom); return; } zdom->uzd_nitems -= bucket->ub_cnt; ZDOM_UNLOCK(zdom); out: bucket_free(zone, bucket, udata); } /* Pops an item out of a per-cpu cache bucket. */ static inline void * cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) { void *item; CRITICAL_ASSERT(curthread); bucket->ucb_cnt--; item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; #ifdef INVARIANTS bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); #endif cache->uc_allocs++; return (item); } /* Pushes an item into a per-cpu cache bucket. */ static inline void cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) { CRITICAL_ASSERT(curthread); KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; bucket->ucb_cnt++; cache->uc_frees++; } /* * Unload a UMA bucket from a per-cpu cache. */ static inline uma_bucket_t cache_bucket_unload(uma_cache_bucket_t bucket) { uma_bucket_t b; b = bucket->ucb_bucket; if (b != NULL) { MPASS(b->ub_entries == bucket->ucb_entries); b->ub_cnt = bucket->ucb_cnt; bucket->ucb_bucket = NULL; bucket->ucb_entries = bucket->ucb_cnt = 0; } return (b); } static inline uma_bucket_t cache_bucket_unload_alloc(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_allocbucket)); } static inline uma_bucket_t cache_bucket_unload_free(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_freebucket)); } static inline uma_bucket_t cache_bucket_unload_cross(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_crossbucket)); } /* * Load a bucket into a per-cpu cache bucket. */ static inline void cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) { CRITICAL_ASSERT(curthread); MPASS(bucket->ucb_bucket == NULL); MPASS(b->ub_seq == SMR_SEQ_INVALID); bucket->ucb_bucket = b; bucket->ucb_cnt = b->ub_cnt; bucket->ucb_entries = b->ub_entries; } static inline void cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_allocbucket, b); } static inline void cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_freebucket, b); } #ifdef NUMA static inline void cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_crossbucket, b); } #endif /* * Copy and preserve ucb_spare. */ static inline void cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { b1->ucb_bucket = b2->ucb_bucket; b1->ucb_entries = b2->ucb_entries; b1->ucb_cnt = b2->ucb_cnt; } /* * Swap two cache buckets. */ static inline void cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { struct uma_cache_bucket b3; CRITICAL_ASSERT(curthread); cache_bucket_copy(&b3, b1); cache_bucket_copy(b1, b2); cache_bucket_copy(b2, &b3); } /* * Attempt to fetch a bucket from a zone on behalf of the current cpu cache. 
*/ static uma_bucket_t cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; smr_seq_t seq; /* * Avoid the lock if possible. */ zdom = ZDOM_GET(zone, domain); if (zdom->uzd_nitems == 0) return (NULL); if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 && (seq = atomic_load_32(&zdom->uzd_seq)) != SMR_SEQ_INVALID && !smr_poll(zone->uz_smr, seq, false)) return (NULL); /* * Check the zone's cache of buckets. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) return (bucket); ZDOM_UNLOCK(zdom); return (NULL); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static inline void zone_maxaction(uma_zone_t zone) { if (zone->uz_maxaction.ta_func != NULL) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *context __unused, int pending __unused) { bucket_enable(); zone_foreach(zone_timeout, NULL); /* Reschedule this event */ taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, UMA_TIMEOUT * hz); } /* * Update the working set size estimates for the zone's bucket cache. * The constants chosen here are somewhat arbitrary. */ static void zone_domain_update_wss(uma_zone_domain_t zdom) { long m; ZDOM_LOCK_ASSERT(zdom); MPASS(zdom->uzd_imax >= zdom->uzd_nitems); MPASS(zdom->uzd_nitems >= zdom->uzd_bimin); MPASS(zdom->uzd_bimin >= zdom->uzd_imin); /* * Estimate WSS as modified moving average of biggest allocation * batches for each period over few minutes (UMA_TIMEOUT of 20s). */ zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4, zdom->uzd_imax - zdom->uzd_bimin); /* * Estimate longtime minimum item count as a combination of recent * minimum item count, adjusted by WSS for safety, and the modified * moving average over the last several hours (UMA_TIMEOUT of 20s). * timin measures time since limin tried to go negative, that means * we were dangerously close to or got out of cache. */ m = zdom->uzd_imin - zdom->uzd_wss; if (m >= 0) { if (zdom->uzd_limin >= m) zdom->uzd_limin = m; else zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256; zdom->uzd_timin++; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } /* To reduce period edge effects on WSS keep half of the imax. */ atomic_subtract_long(&zdom->uzd_imax, (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2); zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems; } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void zone_timeout(uma_zone_t zone, void *unused) { uma_keg_t keg; u_int slabs, pages; if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) goto trim; keg = zone->uz_keg; /* * Hash zones are non-numa by definition so the first domain * is the only one present. */ KEG_LOCK(keg, 0); pages = keg->uk_domain[0].ud_pages; /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? 
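 *
 * (Worked example: a keg with a 64-entry hash and 100 slabs trips the
 * check below, and hash_alloc() is asked for 1 << fls(100) = 128
 * entries, bringing the average chain length back under one.)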
*/ if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ KEG_UNLOCK(keg, 0); ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg, 0); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg, 0); hash_free(&oldhash); goto trim; } } KEG_UNLOCK(keg, 0); trim: /* Trim caches not used for a long time. */ if ((zone->uz_flags & UMA_ZONE_UNMANAGED) == 0) { for (int i = 0; i < vm_ndomains; i++) { if (bucket_cache_reclaim_domain(zone, false, false, i) && (zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, i); } } } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on success and 0 on failure. */ static int hash_alloc(struct uma_hash *hash, u_int size) { size_t alloc; KASSERT(powerof2(size), ("hash size must be power of 2")); if (size > UMA_HASH_SIZE_INIT) { hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_hash_slab_t slab; u_int hval; u_int idx; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (idx = 0; idx < oldhash->uh_hashsize; idx++) while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); LIST_REMOVE(slab, uhs_hlink); hval = UMA_HASH(newhash, slab->uhs_data); LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, uhs_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items. 
* * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket->ub_cnt == 0) return; if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { smr_wait(zone->uz_smr, bucket->ub_seq); bucket->ub_seq = SMR_SEQ_INVALID; for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); } if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); } zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); if (zone->uz_max_items > 0) zone_free_limit(zone, bucket->ub_cnt); #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt); #endif bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being torn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t bucket; smr_seq_t seq; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket = cache_bucket_unload_alloc(cache); if (bucket != NULL) bucket_free(zone, bucket, NULL); bucket = cache_bucket_unload_free(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } bucket = cache_bucket_unload_cross(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } } bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN); } static void cache_shrink(uma_zone_t zone, void *unused) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; ZONE_LOCK(zone); zone->uz_bucket_size = (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; ZONE_UNLOCK(zone); } static void cache_drain_safe_cpu(uma_zone_t zone, void *unused) { uma_cache_t cache; uma_bucket_t b1, b2, b3; int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = b3 = NULL; critical_enter(); cache = &zone->uz_cpu[curcpu]; domain = PCPU_GET(domain); b1 = cache_bucket_unload_alloc(cache); /* * Don't flush SMR zone buckets. This leaves the zone without a * bucket and forces every free to synchronize(). */ if ((zone->uz_flags & UMA_ZONE_SMR) == 0) { b2 = cache_bucket_unload_free(cache); b3 = cache_bucket_unload_cross(cache); } critical_exit(); if (b1 != NULL) zone_free_bucket(zone, b1, NULL, domain, false); if (b2 != NULL) zone_free_bucket(zone, b2, NULL, domain, false); if (b3 != NULL) { /* Adjust the domain so it goes to zone_free_cross. */ domain = (domain + 1) % vm_ndomains; zone_free_bucket(zone, b3, NULL, domain, false); } } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. 
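 * (The sched_bind() loop below migrates the calling thread onto each
 * CPU in turn, so cache_drain_safe_cpu() always runs on the CPU whose
 * cache it is unloading and can rely on a plain critical section.)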
*/ static void pcpu_cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enough, shrink aggressively. */ if (zone) cache_shrink(zone, NULL); else zone_foreach(cache_shrink, NULL); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone, NULL); else zone_foreach(cache_drain_safe_cpu, NULL); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller * requested a drain, otherwise the per-domain caches are trimmed to either * estimated working set size. */ static bool bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; long target; bool done = false; /* * The cross bucket is partially filled and not part of * the item count. Reclaim it individually here. */ zdom = ZDOM_GET(zone, domain); if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) { ZONE_CROSS_LOCK(zone); bucket = zdom->uzd_cross; zdom->uzd_cross = NULL; ZONE_CROSS_UNLOCK(zone); if (bucket != NULL) bucket_free(zone, bucket, NULL); } /* * If we were asked to drain the zone, we are done only once * this bucket cache is empty. If trim, we reclaim items in * excess of the zone's estimated working set size. Multiple * consecutive calls will shrink the WSS and so reclaim more. * If neither drain nor trim, then voluntarily reclaim 1/4 * (to reduce first spike) of items not used for a long time. */ ZDOM_LOCK(zdom); zone_domain_update_wss(zdom); if (drain) target = 0; else if (trim) target = zdom->uzd_wss; else if (zdom->uzd_timin > 900 / UMA_TIMEOUT) target = zdom->uzd_nitems - zdom->uzd_limin / 4; else { ZDOM_UNLOCK(zdom); return (done); } while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL && zdom->uzd_nitems >= target + bucket->ub_cnt) { bucket = zone_fetch_bucket(zone, zdom, true); if (bucket == NULL) break; bucket_free(zone, bucket, NULL); done = true; ZDOM_LOCK(zdom); } ZDOM_UNLOCK(zdom); return (done); } static void bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain) { int i; /* * Shrink the zone bucket size to ensure that the per-CPU caches * don't grow too large. */ if (zone->uz_bucket_size > zone->uz_bucket_size_min) zone->uz_bucket_size--; if (domain != UMA_ANYDOMAIN && (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) { bucket_cache_reclaim_domain(zone, drain, true, domain); } else { for (i = 0; i < vm_ndomains; i++) bucket_cache_reclaim_domain(zone, drain, true, i); } } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; size_t size; int i; uint8_t flags; CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); mem = slab_data(slab, keg); size = PAGE_SIZE * keg->uk_ppera; kasan_mark_slab_valid(keg, mem); if (keg->uk_fini != NULL) { for (i = start - 1; i > -1; i--) #ifdef INVARIANTS /* * trash_fini implies that dtor was trash_dtor. trash_fini * would check that memory hasn't been modified since free, * which executed trash_dtor. * That's why we need to run uma_dbg_kskip() check here, * albeit we don't make skip check for other init/fini * invocations. 
*/ if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || keg->uk_fini != trash_fini) #endif keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); } flags = slab->us_flags; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); } keg->uk_freef(mem, size, flags); uma_total_dec(size); } static void keg_drain_domain(uma_keg_t keg, int domain) { struct slabhead freeslabs; uma_domain_t dom; uma_slab_t slab, tmp; uint32_t i, stofree, stokeep, partial; dom = &keg->uk_domain[domain]; LIST_INIT(&freeslabs); CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", keg->uk_name, keg, domain, dom->ud_free_items); KEG_LOCK(keg, domain); /* * Are the free items in partially allocated slabs sufficient to meet * the reserve? If not, compute the number of fully free slabs that must * be kept. */ partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers; if (partial < keg->uk_reserve) { stokeep = min(dom->ud_free_slabs, howmany(keg->uk_reserve - partial, keg->uk_ipers)); } else { stokeep = 0; } stofree = dom->ud_free_slabs - stokeep; /* * Partition the free slabs into two sets: those that must be kept in * order to maintain the reserve, and those that may be released back to * the system. Since one set may be much larger than the other, * populate the smaller of the two sets and swap them if necessary. */ for (i = min(stofree, stokeep); i > 0; i--) { slab = LIST_FIRST(&dom->ud_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&freeslabs, slab, us_link); } if (stofree > stokeep) LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link); if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) { LIST_FOREACH(slab, &freeslabs, us_link) UMA_HASH_REMOVE(&keg->uk_hash, slab); } dom->ud_free_items -= stofree * keg->uk_ipers; dom->ud_free_slabs -= stofree; dom->ud_pages -= stofree * keg->uk_ppera; KEG_UNLOCK(keg, domain); LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp) keg_free_slab(keg, slab, keg->uk_ipers); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg, int domain) { int i; if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0) return; if (domain != UMA_ANYDOMAIN) { keg_drain_domain(keg, domain); } else { for (i = 0; i < vm_ndomains; i++) keg_drain_domain(keg, i); } } static void zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain) { /* * Count active reclaim operations in order to interlock with * zone_dtor(), which removes the zone from global lists before * attempting to reclaim items itself. * * The zone may be destroyed while sleeping, so only zone_dtor() should * specify M_WAITOK. */ ZONE_LOCK(zone); if (waitok == M_WAITOK) { while (zone->uz_reclaimers > 0) msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1); } zone->uz_reclaimers++; ZONE_UNLOCK(zone); bucket_cache_reclaim(zone, drain, domain); if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, domain); ZONE_LOCK(zone); zone->uz_reclaimers--; if (zone->uz_reclaimers == 0) wakeup(zone); ZONE_UNLOCK(zone); } /* * Allocate a new slab for a keg and inserts it into the partial slab list. * The keg should be unlocked on entry. If the allocation succeeds it will * be locked on return. * * Arguments: * flags Wait flags for the item initialization routine * aflags Wait flags for the slab allocation * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. 
*/ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, int aflags) { uma_domain_t dom; uma_slab_t slab; unsigned long size; uint8_t *mem; uint8_t sflags; int i; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_alloc_slab: domain %d out of range", domain)); slab = NULL; mem = NULL; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { uma_hash_slab_t hslab; hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, domain, aflags); if (hslab == NULL) goto fail; slab = &hslab->uhs_slab; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) aflags |= M_ZERO; else aflags &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) aflags |= M_NODUMP; /* zone is passed for legacy reasons. */ size = keg->uk_ppera * PAGE_SIZE; mem = keg->uk_allocf(zone, size, domain, &sflags, aflags); if (mem == NULL) { if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); goto fail; } uma_total_inc(size); /* For HASH zones all pages go to the same uma_domain. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; kmsan_mark(mem, size, (aflags & M_ZERO) != 0 ? KMSAN_STATE_INITED : KMSAN_STATE_UNINIT); /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) slab = (uma_slab_t)(mem + keg->uk_pgoff); else slab_tohashslab(slab)->uhs_data = mem; if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), zone, slab); slab->us_freecount = keg->uk_ipers; slab->us_flags = sflags; slab->us_domain = domain; BIT_FILL(keg->uk_ipers, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); #endif if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab_item(slab, keg, i), keg->uk_size, flags) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); goto fail; } } kasan_mark_slab_invalid(keg, mem); KEG_LOCK(keg, domain); CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", slab, keg->uk_name, keg); if (keg->uk_flags & UMA_ZFLAG_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ dom = &keg->uk_domain[domain]; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); dom->ud_pages += keg->uk_ppera; dom->ud_free_items += keg->uk_ipers; return (slab); fail: return (NULL); } /* * This function is intended to be used early on in place of page_alloc(). It * performs contiguous physical memory allocations and uses a bump allocator for * KVA, so is usable before the kernel map is initialized. */ static void * startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vm_paddr_t pa; vm_page_t m; int i, pages; pages = howmany(bytes, PAGE_SIZE); KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); *pflag = UMA_SLAB_BOOT; m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); for (i = 0; i < pages; i++, pa += PAGE_SIZE) { #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif } /* Allocate KVA and indirectly advance bootmem. 
*/ return ((void *)pmap_map(&bootmem, m->phys_addr, m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE)); } static void startup_free(void *mem, vm_size_t bytes) { vm_offset_t va; vm_page_t m; va = (vm_offset_t)mem; m = PHYS_TO_VM_PAGE(pmap_kextract(va)); /* * startup_alloc() returns direct-mapped slabs on some platforms. Avoid * unmapping ranges of the direct map. */ if (va >= bootstart && va + bytes <= bootmem) pmap_remove(kernel_pmap, va, va + bytes); for (; bytes != 0; bytes -= PAGE_SIZE, m++) { #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); return (p); } static void * pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { struct pglist alloctail; vm_offset_t addr, zkva; int cpu, flags; vm_page_t p, p_next; #ifdef NUMA struct pcpu *pc; #endif MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); TAILQ_INIT(&alloctail); flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | malloc2vm_flags(wait); *pflag = UMA_SLAB_KERNEL; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) { p = vm_page_alloc_noobj(flags); } else { #ifndef NUMA p = vm_page_alloc_noobj(flags); #else pc = pcpu_find(cpu); if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) p = NULL; else p = vm_page_alloc_noobj_domain(pc->pc_domain, flags); if (__predict_false(p == NULL)) p = vm_page_alloc_noobj(flags); #endif } if (__predict_false(p == NULL)) goto fail; TAILQ_INSERT_TAIL(&alloctail, p, listq); } if ((addr = kva_alloc(bytes)) == 0) goto fail; zkva = addr; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void*)addr); fail: TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } /* * Allocates a number of pages not belonging to a VM object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; int req; TAILQ_INIT(&alloctail); keg = zone->uz_keg; req = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED; if ((wait & M_WAITOK) != 0) req |= VM_ALLOC_WAITOK; npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { p = vm_page_alloc_noobj_domain(domain, req); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } /* * Page allocation failed, free intermediate pages and * exit. 
*/ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Allocate physically contiguous pages. */ static void * contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, vm_size_t size, uint8_t flags) { if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } KASSERT((flags & UMA_SLAB_KERNEL) != 0, ("UMA: page_free used with invalid flags %x", flags)); kmem_free((vm_offset_t)mem, size); } /* * Frees pcpu zone allocations * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) { vm_offset_t sva, curva; vm_paddr_t paddr; vm_page_t m; MPASS(size == (mp_maxid+1)*PAGE_SIZE); if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } sva = (vm_offset_t)mem; for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { paddr = pmap_kextract(curva); m = PHYS_TO_VM_PAGE(paddr); vm_page_unwire_noq(m); vm_page_free(m); } pmap_qremove(sva, size >> PAGE_SHIFT); kva_free(sva, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } #ifdef INVARIANTS static struct noslabbits * slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) { return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); } #endif /* * Actual size of embedded struct slab (!OFFPAGE). */ static size_t slab_sizeof(int nitems) { size_t s; s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; return (roundup(s, UMA_ALIGN_PTR + 1)); } #define UMA_FIXPT_SHIFT 31 #define UMA_FRAC_FIXPT(n, d) \ ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) #define UMA_FIXPT_PCT(f) \ ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) /* * Compute the number of items that will fit in a slab. If hdr is true, the * item count may be limited to provide space in the slab for an inline slab * header. Otherwise, all slab space will be provided for item storage. */ static u_int slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) { u_int ipers; u_int padpi; /* The padding between items is not needed after the last item. */ padpi = rsize - size; if (hdr) { /* * Start with the maximum item count and remove items until * the slab header first alongside the allocatable memory. 
*/ for (ipers = MIN(SLAB_MAX_SETSIZE, (slabsize + padpi - slab_sizeof(1)) / rsize); ipers > 0 && ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; ipers--) continue; } else { ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); } return (ipers); } struct keg_layout_result { u_int format; u_int slabsize; u_int ipers; u_int eff; }; static void keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt, struct keg_layout_result *kl) { u_int total; kl->format = fmt; kl->slabsize = slabsize; /* Handle INTERNAL as inline with an extra page. */ if ((fmt & UMA_ZFLAG_INTERNAL) != 0) { kl->format &= ~UMA_ZFLAG_INTERNAL; kl->slabsize += PAGE_SIZE; } kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize, (fmt & UMA_ZFLAG_OFFPAGE) == 0); /* Account for memory used by an offpage slab header. */ total = kl->slabsize; if ((fmt & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(kl->ipers)->uz_keg->uk_rsize; kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total); } /* * Determine the format of a uma keg. This determines where the slab header * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_layout(uma_keg_t keg) { struct keg_layout_result kl = {}, kl_tmp; u_int fmts[2]; u_int alignsize; u_int nfmt; u_int pages; u_int rsize; u_int slabsize; u_int i, j; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", __func__, keg->uk_name, keg->uk_size, keg->uk_flags, PRINT_UMA_ZFLAGS)); KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 || (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, PRINT_UMA_ZFLAGS)); alignsize = keg->uk_align + 1; #ifdef KASAN /* * ASAN requires that each allocation be aligned to the shadow map * scale factor. */ if (alignsize < KASAN_SHADOW_SCALE) alignsize = KASAN_SHADOW_SCALE; #endif /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); rsize = roundup2(rsize, alignsize); if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { /* * We want one item to start on every align boundary in a page. * To do this we will span pages. We will also extend the item * by the size of align if it is an even multiple of align. * Otherwise, it would fall on the same boundary every time. */ if ((rsize & alignsize) == 0) rsize += alignsize; slabsize = rsize * (PAGE_SIZE / alignsize); slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); slabsize = round_page(slabsize); } else { /* * Start with a slab size of as many pages as it takes to * represent a single item. We will try to fit as many * additional items into the slab as possible. */ slabsize = round_page(keg->uk_size); } /* Build a list of all of the available formats for this keg. */ nfmt = 0; /* Evaluate an inline slab layout. */ if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) fmts[nfmt++] = 0; /* TODO: vm_page-embedded slab. */ /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not want * to do if we're UMA_ZONE_VM, which clearly forbids it. 
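 * (An OFFPAGE header would itself have to be allocated from a slab
 * zone, which is exactly the kind of recursion those flags are meant
 * to avoid.)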
* In those cases, evaluate a pseudo-format called INTERNAL * which has an inline slab header and one extra page to * guarantee that it fits. * * Otherwise, see if using an OFFPAGE slab will improve our * efficiency. */ if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0) fmts[nfmt++] = UMA_ZFLAG_INTERNAL; else fmts[nfmt++] = UMA_ZFLAG_OFFPAGE; /* * Choose a slab size and format which satisfy the minimum efficiency. * Prefer the smallest slab size that meets the constraints. * * Start with a minimum slab size, to accommodate CACHESPREAD. Then, * for small items (up to PAGE_SIZE), the iteration increment is one * page; and for large items, the increment is one item. */ i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize); KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u", keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize, rsize, i)); for ( ; ; i++) { slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) : round_page(rsize * (i - 1) + keg->uk_size); for (j = 0; j < nfmt; j++) { /* Only if we have no viable format yet. */ if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 && kl.ipers > 0) continue; keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp); if (kl_tmp.eff <= kl.eff) continue; kl = kl_tmp; CTR6(KTR_UMA, "keg %s layout: format %#x " "(ipers %u * rsize %u) / slabsize %#x = %u%% eff", keg->uk_name, kl.format, kl.ipers, rsize, kl.slabsize, UMA_FIXPT_PCT(kl.eff)); /* Stop when we reach the minimum efficiency. */ if (kl.eff >= UMA_MIN_EFF) break; } if (kl.eff >= UMA_MIN_EFF || !multipage_slabs || slabsize >= SLAB_MAX_SETSIZE * rsize || (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0) break; } pages = atop(kl.slabsize); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) pages *= mp_maxid + 1; keg->uk_rsize = rsize; keg->uk_ipers = kl.ipers; keg->uk_ppera = pages; keg->uk_flags |= kl.format; /* * How do we find the slab header if it is offpage or if not all item * start addresses are in the same page? We could solve the latter * case with vaddr alignment, but we don't. */ if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 || (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) { if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) keg->uk_flags |= UMA_ZFLAG_HASH; else keg->uk_flags |= UMA_ZFLAG_VTOSLAB; } CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers, pages); KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, keg->uk_ipers, pages)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; int i; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_reserve = 0; keg->uk_flags = arg->flags; /* * We use a global round-robin policy by default. Zones with * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which * case the iterator is never run. */ keg->uk_dr.dr_policy = DOMAINSET_RR(); keg->uk_dr.dr_iter = 0; /* * The primary zone is passed to us at keg-creation time. 
*/ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZFLAG_VTOSLAB; #ifndef SMP keg->uk_flags &= ~UMA_ZONE_PCPU; #endif keg_layout(keg); /* * Use a first-touch NUMA policy for kegs that pmap_extract() will * work on. Use round-robin for everything else. * * Zones may override the default by specifying either. */ #ifdef NUMA if ((keg->uk_flags & (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0) keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) keg->uk_flags |= UMA_ZONE_ROUNDROBIN; #endif /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; else #endif if (booted < BOOT_KVA) keg->uk_allocf = startup_alloc; else if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_freef = uma_small_free; else #endif if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_freef = pcpu_page_free; else keg->uk_freef = page_free; /* * Initialize keg's locks. */ for (i = 0; i < vm_ndomains; i++) KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. See slab_sizeof * definition. */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { size_t shsize; shsize = slab_sizeof(keg->uk_ipers); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, ("zone %s ipers %d rsize %d size %d slab won't fit", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); } if (keg->uk_flags & UMA_ZFLAG_HASH) hash_alloc(&keg->uk_hash, 0); CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } static void zone_kva_available(uma_zone_t zone, void *unused) { uma_keg_t keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return; KEG_GET(zone, keg); if (keg->uk_allocf == startup_alloc) { /* Switch to the real allocator. */ if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; } } static void zone_alloc_counters(uma_zone_t zone, void *unused) { zone->uz_allocs = counter_u64_alloc(M_WAITOK); zone->uz_frees = counter_u64_alloc(M_WAITOK); zone->uz_fails = counter_u64_alloc(M_WAITOK); zone->uz_xdomain = counter_u64_alloc(M_WAITOK); } static void zone_alloc_sysctl(uma_zone_t zone, void *unused) { uma_zone_domain_t zdom; uma_domain_t dom; uma_keg_t keg; struct sysctl_oid *oid, *domainoid; int domains, i, cnt; static const char *nokeg = "cache zone"; char *c; /* * Make a sysctl safe copy of the zone name by removing * any special characters and handling dups by appending * an index. 
*/ if (zone->uz_namecnt != 0) { /* Count the number of decimal digits and '_' separator. */ for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) cnt /= 10; zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, M_UMA, M_WAITOK); sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, zone->uz_namecnt); } else zone->uz_ctlname = strdup(zone->uz_name, M_UMA); for (c = zone->uz_ctlname; *c != '\0'; c++) if (strchr("./\\ -", *c) != NULL) *c = '_'; /* * Basic parameters at the root. */ zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); oid = zone->uz_oid; SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_flags, "A", "Allocator configuration flags"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, "Desired per-cpu cache size"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, "Maximum allowed per-cpu cache size"); /* * keg if present. */ if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) domains = vm_ndomains; else domains = 1; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); keg = zone->uz_keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, keg->uk_name, "Keg name"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, "Real object size with alignment"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, "pages per-slab allocation"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, "items available per-slab"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "align", CTLFLAG_RD, &keg->uk_align, 0, "item alignment mask"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "reserve", CTLFLAG_RD, &keg->uk_reserve, 0, "number of reserved items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, keg, 0, sysctl_handle_uma_slab_efficiency, "I", "Slab utilization (100 - internal fragmentation %)"); domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { dom = &keg->uk_domain[i]; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "pages", CTLFLAG_RD, &dom->ud_pages, 0, "Total pages currently allocated from VM"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_items", CTLFLAG_RD, &dom->ud_free_items, 0, "Items free in the slab layer"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0, "Unused slabs"); } } else SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, nokeg, "Keg name"); /* * Information about zone limits. 
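 * (Exported as the vm.uma.<zone>.limit sysctl subtree.)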
*/ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_items, "QU", "Current number of allocated items if limit is set"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, "Maximum number of allocated and cached items"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, "Number of threads sleeping at limit"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, "Total zone limit sleeps"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0, "Maximum number of items in each domain's bucket cache"); /* * Per-domain zone information. */ domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { zdom = ZDOM_GET(zone, i); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "nitems", CTLFLAG_RD, &zdom->uzd_nitems, "number of items in this domain"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imax", CTLFLAG_RD, &zdom->uzd_imax, "maximum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imin", CTLFLAG_RD, &zdom->uzd_imin, "minimum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bimin", CTLFLAG_RD, &zdom->uzd_bimin, "Minimum item count in this batch"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wss", CTLFLAG_RD, &zdom->uzd_wss, "Working set size"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "limin", CTLFLAG_RD, &zdom->uzd_limin, "Long time minimum item count"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "timin", CTLFLAG_RD, &zdom->uzd_timin, 0, "Time since zero long time minimum item count"); } /* * General statistics. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, zone, 1, sysctl_handle_uma_zone_cur, "I", "Current number of allocated items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_allocs, "QU", "Total allocation calls"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_frees, "QU", "Total free calls"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "fails", CTLFLAG_RD, &zone->uz_fails, "Number of allocation failures"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "xdomain", CTLFLAG_RD, &zone->uz_xdomain, "Free calls from the wrong domain"); } struct uma_zone_count { const char *name; int count; }; static void zone_count(uma_zone_t zone, void *arg) { struct uma_zone_count *cnt; cnt = arg; /* * Some zones are rapidly created with identical names and * destroyed out of order. This can lead to gaps in the count. * Use one greater than the maximum observed for this name. 
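 * (For example, if zones with suffixes _1 and _3 are still present,
 * the next duplicate of that name is numbered _4.)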
*/ if (strcmp(zone->uz_name, cnt->name) == 0) cnt->count = MAX(cnt->count, zone->uz_namecnt + 1); } static void zone_update_caches(uma_zone_t zone) { int i; for (i = 0; i <= mp_maxid; i++) { cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); } } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zone_count cnt; struct uma_zctor_args *arg = udata; uma_zone_domain_t zdom; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; int i; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_sleeps = 0; zone->uz_bucket_size = 0; zone->uz_bucket_size_min = 0; zone->uz_bucket_size_max = BUCKET_MAX; zone->uz_flags = (arg->flags & UMA_ZONE_SMR); zone->uz_warning = NULL; /* The domain structures follow the cpu structures. */ zone->uz_bucket_max = ULONG_MAX; timevalclear(&zone->uz_ratecheck); /* Count the number of duplicate names. */ cnt.name = arg->name; cnt.count = 0; zone_foreach(zone_count, &cnt); zone->uz_namecnt = cnt.count; ZONE_CROSS_LOCK_INIT(zone); for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(zone, i); ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS)); STAILQ_INIT(&zdom->uzd_buckets); } #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) if (arg->uminit == trash_init && arg->fini == trash_fini) zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; #elif defined(KASAN) if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0) arg->flags |= UMA_ZONE_NOKASAN; #endif /* * This is a pure cache zone, no kegs. */ if (arg->import) { KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, ("zone_ctor: Import specified for non-cache zone.")); zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; #ifdef NUMA /* * Cache zones are round-robin unless a policy is * specified because they may have incompatible * constraints. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) zone->uz_flags |= UMA_ZONE_ROUNDROBIN; #endif rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. 
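 * (zone_import and zone_release move items between the zone's bucket
 * layer and the keg's slabs.)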
*/ zone->uz_import = zone_import; zone->uz_release = zone_release; zone->uz_arg = zone; keg = arg->keg; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = (arg->flags & ~UMA_ZONE_SMR); karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* Inherit properties from the keg. */ zone->uz_keg = keg; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); out: if (booted >= BOOT_PCPU) { zone_alloc_counters(zone, NULL); if (booted >= BOOT_RUNNING) zone_alloc_sysctl(zone, NULL); } else { zone->uz_allocs = EARLY_COUNTER; zone->uz_frees = EARLY_COUNTER; zone->uz_fails = EARLY_COUNTER; } /* Caller requests a private SMR context. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) zone->uz_smr = smr_create(zone->uz_name, 0, 0); KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), ("Invalid zone flag combination")); if (arg->flags & UMA_ZFLAG_INTERNAL) zone->uz_bucket_size_max = zone->uz_bucket_size = 0; if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) zone->uz_bucket_size = BUCKET_MAX; else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) zone->uz_bucket_size = 0; else zone->uz_bucket_size = bucket_select(zone->uz_size); zone->uz_bucket_size_min = zone->uz_bucket_size; if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) zone->uz_flags |= UMA_ZFLAG_CTORDTOR; zone_update_caches(zone); return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; uint32_t free, pages; int i; keg = (uma_keg_t)arg; free = pages = 0; for (i = 0; i < vm_ndomains; i++) { free += keg->uk_domain[i].ud_free_items; pages += keg->uk_domain[i].ud_pages; KEG_LOCK_FINI(keg, i); } if (pages != 0) printf("Freed UMA keg (%s) was not empty (%u items). " " Lost %u pages of memory.\n", keg->uk_name ? keg->uk_name : "", pages / keg->uk_ppera * keg->uk_ipers - free, pages); hash_free(&keg->uk_hash); } /* * Zone header dtor. 
* * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; int i; zone = (uma_zone_t)arg; sysctl_remove_oid(zone->uz_oid, 1, 1); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; keg->uk_reserve = 0; } zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true); /* * We only destroy kegs from non secondary/non cache zones. */ if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } counter_u64_free(zone->uz_allocs); counter_u64_free(zone->uz_frees); counter_u64_free(zone->uz_fails); counter_u64_free(zone->uz_xdomain); free(zone->uz_ctlname, M_UMA); for (i = 0; i < vm_ndomains; i++) ZDOM_LOCK_FINI(ZDOM_GET(zone, i)); ZONE_CROSS_LOCK_FINI(zone); } static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg) { uma_keg_t keg; uma_zone_t zone; LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone, arg); } LIST_FOREACH(zone, &uma_cachezones, uz_link) zfunc(zone, arg); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) { rw_rlock(&uma_rwlock); zone_foreach_unlocked(zfunc, arg); rw_runlock(&uma_rwlock); } /* * Initialize the kernel memory allocator. This is done after pages can be * allocated but before general KVA is available. */ void uma_startup1(vm_offset_t virtual_avail) { struct uma_zctor_args args; size_t ksize, zsize, size; uma_keg_t primarykeg; uintptr_t m; int domain; uint8_t pflag; bootstart = bootmem = virtual_avail; rw_init(&uma_rwlock, "UMA lock"); sx_init(&uma_reclaim_lock, "umareclaim"); ksize = sizeof(struct uma_keg) + (sizeof(struct uma_domain) * vm_ndomains); ksize = roundup(ksize, UMA_SUPER_ALIGN); zsize = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)) + (sizeof(struct uma_zone_domain) * vm_ndomains); zsize = roundup(zsize, UMA_SUPER_ALIGN); /* Allocate the zone of zones, zone of kegs, and zone of zones keg. 
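 * (A single startup_alloc() carve-out holds all three back to back:
 * the zone of zones, then the zone of kegs, then the primary keg.)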
*/ size = (zsize * 2) + ksize; for (domain = 0; domain < vm_ndomains; domain++) { m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag, M_NOWAIT | M_ZERO); if (m != 0) break; } zones = (uma_zone_t)m; m += zsize; kegs = (uma_zone_t)m; m += zsize; primarykeg = (uma_keg_t)m; /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = primarykeg; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(kegs, zsize, &args, M_WAITOK); args.name = "UMA Zones"; args.size = zsize; args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make zones for slab headers */ slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); smr_init(); } #ifndef UMA_MD_SMALL_ALLOC extern void vm_radix_reserve_kva(void); #endif /* * Advertise the availability of normal kva allocations and switch to * the default back-end allocator. Marks the KVA we consumed on startup * as used in the map. */ void uma_startup2(void) { if (bootstart != bootmem) { vm_map_lock(kernel_map); (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); vm_map_unlock(kernel_map); } #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); #endif booted = BOOT_KVA; zone_foreach_unlocked(zone_kva_available, NULL); bucket_enable(); } /* * Allocate counters as early as possible so that boot-time allocations are * accounted more precisely. */ static void uma_startup_pcpu(void *arg __unused) { zone_foreach_unlocked(zone_alloc_counters, NULL); booted = BOOT_PCPU; } SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL); /* * Finish our initialization steps. */ static void uma_startup3(void *arg __unused) { #ifdef INVARIANTS TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); uma_dbg_cnt = counter_u64_alloc(M_WAITOK); uma_skip_cnt = counter_u64_alloc(M_WAITOK); #endif zone_foreach_unlocked(zone_alloc_sysctl, NULL); booted = BOOT_RUNNING; EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); static void uma_startup4(void *arg __unused) { TIMEOUT_TASK_INIT(taskqueue_thread, &uma_timeout_task, 0, uma_timeout, NULL); taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, UMA_TIMEOUT * hz); } SYSINIT(uma_startup4, SI_SUB_TASKQ, SI_ORDER_ANY, uma_startup4, NULL); static void uma_shutdown(void) { booted = BOOT_SHUTDOWN; } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? 
uma_align_cache : align; args.flags = flags; args.zone = zone; return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* Public functions */ /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", align, name)); /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) /* * Inject procedures which check for memory use after free if we are * allowed to scramble the memory while it is not allocated. This * requires that: UMA is actually able to access the memory, no init * or fini procedures, no dependency on the initial value of the * memory, and no (legitimate) use of the memory after free. Note, * the ctor and dtor do not need to be empty. */ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { args.uminit = trash_init; args.fini = trash_fini; } #endif args.align = align; args.flags = flags; args.keg = NULL; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; keg = primary->uz_keg; memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags | UMA_ZFLAG_CACHE; return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { /* * Large slabs are expensive to reclaim, so don't bother doing * unnecessary work if we're shutting down. 
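 * (Teardown is skipped only for zones with no fini hook that free
 * through the default zone_release path, as checked below.)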
*/ if (booted == BOOT_SHUTDOWN && zone->uz_fini == NULL && zone->uz_release == zone_release) return; sx_xlock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_xunlock(&uma_reclaim_lock); } void uma_zwait(uma_zone_t zone) { if ((zone->uz_flags & UMA_ZONE_SMR) != 0) uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK)); else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK)); else uma_zfree(zone, uma_zalloc(zone, M_WAITOK)); } void * uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) { void *item, *pcpu_item; #ifdef SMP int i; MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); if (item == NULL) return (NULL); pcpu_item = zpcpu_base_to_offset(item); if (flags & M_ZERO) { #ifdef SMP for (i = 0; i <= mp_maxid; i++) bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); #else bzero(item, zone->uz_size); #endif } return (pcpu_item); } /* * A stub while both regular and pcpu cases are identical. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata) { void *item; #ifdef SMP MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif /* uma_zfree_pcu_*(..., NULL) does nothing, to match free(9). */ if (pcpu_item == NULL) return; item = zpcpu_offset_to_base(pcpu_item); uma_zfree_arg(zone, item, udata); } static inline void * item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags, void *item) { #ifdef INVARIANTS bool skipdbg; #endif kasan_mark_item_valid(zone, item); kmsan_mark_item_uninitialized(zone, item); #ifdef INVARIANTS skipdbg = uma_dbg_zskip(zone, item); if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_ctor != trash_ctor) trash_ctor(item, size, udata, flags); #endif /* Check flags before loading ctor pointer. 
*/ if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) && __predict_false(zone->uz_ctor != NULL) && zone->uz_ctor(item, size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); return (NULL); } #ifdef INVARIANTS if (!skipdbg) uma_dbg_alloc(zone, NULL, item); #endif if (__predict_false(flags & M_ZERO)) return (memset(item, 0, size)); return (item); } static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS bool skipdbg; skipdbg = uma_dbg_zskip(zone, item); if (skip == SKIP_NONE && !skipdbg) { if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (__predict_true(skip < SKIP_DTOR)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, size, udata); #ifdef INVARIANTS if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_dtor != trash_dtor) trash_dtor(item, size, udata); #endif } kasan_mark_item_invalid(zone, item); } #ifdef NUMA static int item_domain(void *item) { int domain; domain = vm_phys_domain(vtophys(item)); KASSERT(domain >= 0 && domain < vm_ndomains, ("%s: unknown domain for item %p", __func__, item)); return (domain); } #endif #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS) #if defined(INVARIANTS) && (defined(DDB) || defined(STACK)) #include #endif #define UMA_ZALLOC_DEBUG static int uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags) { int error; error = 0; #ifdef WITNESS if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_debug: zone \"%s\"", zone->uz_name); } #endif #ifdef INVARIANTS KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_debug: called with M_EXEC")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_debug: called within spinlock or critical section")); KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0, ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO")); _Static_assert(M_NOWAIT != 0 && M_WAITOK != 0, "M_NOWAIT and M_WAITOK must be non-zero for this assertion:"); #if 0 /* * Give the #elif clause time to find problems, then remove it * and enable this. (Remove above, too.) */ KASSERT((flags & (M_NOWAIT|M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT|M_WAITOK)) == M_WAITOK, ("uma_zalloc_debug: must pass one of M_NOWAIT or M_WAITOK")); #elif defined(DDB) || defined(STACK) if (__predict_false((flags & (M_NOWAIT|M_WAITOK)) != M_NOWAIT && (flags & (M_NOWAIT|M_WAITOK)) != M_WAITOK)) { static int stack_count; struct stack st; if (stack_count < 10) { ++stack_count; printf("uma_zalloc* called with bad WAIT flags:\n"); stack_save(&st); stack_print(&st); } } #endif #endif #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) { void *item; item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { error = EJUSTRETURN; if (zone->uz_init != NULL && zone->uz_init(item, zone->uz_size, flags) != 0) { *itemp = NULL; return (error); } if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); *itemp = NULL; return (error); } *itemp = item; return (error); } /* This is unfortunate but should not be fatal. 
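 * (memguard_alloc() failed; error is still 0, so the caller falls
 * back to the normal allocation path.)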
*/ } #endif return (error); } static int uma_zfree_debug(uma_zone_t zone, void *item, void *udata) { KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zfree_debug: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return (EJUSTRETURN); } #endif return (0); } #endif static inline void * cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket, void *udata, int flags) { void *item; int size, uz_flags; item = cache_bucket_pop(cache, bucket); size = cache_uz_size(cache); uz_flags = cache_uz_flags(cache); critical_exit(); return (item_ctor(zone, uz_flags, size, udata, flags, item)); } static __noinline void * cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_cache_bucket_t bucket; int domain; while (cache_alloc(zone, cache, udata, flags)) { cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) continue; return (cache_alloc_item(zone, cache, bucket, udata, flags)); } critical_exit(); /* * We can not get a bucket so try to return a single item. */ if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) domain = PCPU_GET(domain); else domain = UMA_ANYDOMAIN; return (zone_alloc_item(zone, udata, domain, flags)); } /* See uma.h */ void * uma_zalloc_smr(uma_zone_t zone, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; CTR3(KTR_UMA, "uma_zalloc_smr zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zalloc_arg: called with non-SMR zone.")); if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN) return (item); #endif critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, NULL, flags)); return (cache_alloc_item(zone, cache, bucket, NULL, flags)); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_arg: called with SMR zone.")); if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. 
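 * (This is why the slow path always re-reads the per-CPU cache
 * pointer from curcpu after re-entering the critical section.)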
*/ critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, udata, flags)); return (cache_alloc_item(zone, cache, bucket, udata, flags)); } /* * Replenish an alloc bucket and possibly restore an old one. Called in * a critical section. Returns in a critical section. * * A false return value indicates an allocation failure. * A true return value indicates success and the caller should retry. */ static __noinline bool cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_bucket_t bucket; int curdomain, domain; bool new; CRITICAL_ASSERT(curthread); /* * If we have run out of items in our alloc bucket see * if we can switch with the free bucket. * * SMR Zones can't re-use the free bucket until the sequence has * expired. */ if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 && cache->uc_freebucket.ucb_cnt != 0) { cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); return (true); } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache_bucket_unload_alloc(cache); critical_exit(); if (bucket != NULL) { KASSERT(bucket->ub_cnt == 0, ("cache_alloc: Entered with non-empty alloc bucket.")); bucket_free(zone, bucket, udata); } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zdom lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ domain = PCPU_GET(domain); if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 || VM_DOMAIN_EMPTY(domain)) domain = zone_domain_highest(zone, domain); bucket = cache_fetch_bucket(zone, cache, domain); if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) { bucket = zone_alloc_bucket(zone, udata, domain, flags); new = true; } else { new = false; } CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket == NULL) { critical_enter(); return (false); } /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket.ucb_bucket == NULL && ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 || (curdomain = PCPU_GET(domain)) == domain || VM_DOMAIN_EMPTY(curdomain))) { if (new) atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax, bucket->ub_cnt); cache_bucket_load_alloc(cache, bucket); return (true); } /* * We lost the race, release this bucket and start over. 
*/ critical_exit(); zone_put_bucket(zone, domain, bucket, udata, !new); critical_enter(); return (true); } void * uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { #ifdef NUMA uma_bucket_t bucket; uma_zone_domain_t zdom; void *item; #endif /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", zone->uz_name, zone, domain, flags); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_domain: called with SMR zone.")); #ifdef NUMA KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0, ("uma_zalloc_domain: called with non-FIRSTTOUCH zone.")); if (vm_ndomains == 1) return (uma_zalloc_arg(zone, udata, flags)); #ifdef UMA_ZALLOC_DEBUG if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * Try to allocate from the bucket cache before falling back to the keg. * We could try harder and attempt to allocate from per-CPU caches or * the per-domain cross-domain buckets, but the complexity is probably * not worth it. It is more important that frees of previous * cross-domain allocations do not blow up the cache. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt - 1] = NULL; #endif bucket->ub_cnt--; zone_put_bucket(zone, domain, bucket, udata, true); item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item != NULL) { KASSERT(item_domain(item) == domain, ("%s: bucket cache item %p from wrong domain", __func__, item)); counter_u64_add(zone->uz_allocs, 1); } return (item); } ZDOM_UNLOCK(zdom); return (zone_alloc_item(zone, udata, domain, flags)); #else return (uma_zalloc_arg(zone, udata, flags)); #endif } /* * Find a slab with some space. Prefer slabs that are partially used over those * that are totally full. This helps to reduce fragmentation. * * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check * only 'domain'. */ static uma_slab_t keg_first_slab(uma_keg_t keg, int domain, bool rr) { uma_domain_t dom; uma_slab_t slab; int start; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_first_slab: domain %d out of range", domain)); KEG_LOCK_ASSERT(keg, domain); slab = NULL; start = domain; do { dom = &keg->uk_domain[domain]; if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL) return (slab); if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) { LIST_REMOVE(slab, us_link); dom->ud_free_slabs--; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } if (rr) domain = (domain + 1) % vm_ndomains; } while (domain != start); return (NULL); } /* * Fetch an existing slab from a free or partial list. Returns with the * keg domain lock held if a slab was found or unlocked if not. */ static uma_slab_t keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) { uma_slab_t slab; uint32_t reserve; /* HASH has a single free list. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; KEG_LOCK(keg, domain); reserve = (flags & M_USE_RESERVE) != 0 ? 
0 : keg->uk_reserve; if (keg->uk_domain[domain].ud_free_items <= reserve || (slab = keg_first_slab(keg, domain, rr)) == NULL) { KEG_UNLOCK(keg, domain); return (NULL); } return (slab); } static uma_slab_t keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) { struct vm_domainset_iter di; uma_slab_t slab; int aflags, domain; bool rr; KASSERT((flags & (M_WAITOK | M_NOVM)) != (M_WAITOK | M_NOVM), ("%s: invalid flags %#x", __func__, flags)); restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). * * To avoid races we run the iterator with the keg lock held, but that * means that we cannot allow the vm_domainset layer to sleep. Thus, * clear M_WAITOK and handle low memory conditions locally. */ rr = rdomain == UMA_ANYDOMAIN; if (rr) { aflags = (flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); } else { aflags = flags; domain = rdomain; } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); if (slab != NULL) return (slab); /* * M_NOVM is used to break the recursion that can otherwise * occur if low-level memory management routines use UMA. */ if ((flags & M_NOVM) == 0) { slab = keg_alloc_slab(keg, zone, domain, flags, aflags); if (slab != NULL) return (slab); } if (!rr) { if ((flags & M_USE_RESERVE) != 0) { /* * Drain reserves from other domains before * giving up or sleeping. It may be useful to * support per-domain reserves eventually. */ rdomain = UMA_ANYDOMAIN; goto restart; } if ((flags & M_WAITOK) == 0) break; vm_wait_domain(domain); } else if (vm_domainset_iter_policy(&di, &domain) != 0) { if ((flags & M_WAITOK) != 0) { vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); goto restart; } break; } } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) return (slab); return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { uma_domain_t dom; void *item; int freei; KEG_LOCK_ASSERT(keg, slab->us_domain); dom = &keg->uk_domain[slab->us_domain]; freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; BIT_CLR(keg->uk_ipers, freei, &slab->us_free); item = slab_item(slab, keg, freei); slab->us_freecount--; dom->ud_free_items--; /* * Move this slab to the full list. It must be on the partial list, so * we do not need to update the free slab count. In particular, * keg_fetch_slab() always returns slabs on the partial list. */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int zone_import(void *arg, void **bucket, int max, int domain, int flags) { uma_domain_t dom; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; #ifdef NUMA int stripe; #endif int i; zone = arg; slab = NULL; keg = zone->uz_keg; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) break; #ifdef NUMA stripe = howmany(max, vm_ndomains); #endif dom = &keg->uk_domain[slab->us_domain]; do { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_reserve > 0 && dom->ud_free_items <= keg->uk_reserve) { /* * Avoid depleting the reserve after a * successful item allocation, even if * M_USE_RESERVE is specified. */ KEG_UNLOCK(keg, slab->us_domain); goto out; } #ifdef NUMA /* * If the zone is striped we pick a new slab for every * N allocations. 
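 * (As a purely illustrative example, with max = 128 and
 * vm_ndomains = 4 this gives stripe = howmany(128, 4) = 32, so at
 * most 32 consecutive items are taken from one slab before a new
 * slab, and potentially a new domain, is chosen.)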
Eliminating this conditional will * instead pick a new domain for each bucket rather * than stripe within each bucket. The current option * produces more fragmentation and requires more cpu * time but yields better distribution. */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && vm_ndomains > 1 && --stripe == 0) break; #endif } while (slab->us_freecount != 0 && i < max); KEG_UNLOCK(keg, slab->us_domain); /* Don't block if we allocated any successfully. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } out: return i; } static int zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) { uint64_t old, new, total, max; /* * The hard case. We're going to sleep because there were existing * sleepers or because we ran out of items. This routine enforces * fairness by keeping fifo order. * * First release our ill gotten gains and make some noise. */ for (;;) { zone_free_limit(zone, count); zone_log_warning(zone); zone_maxaction(zone); if (flags & M_NOWAIT) return (0); /* * We need to allocate an item or set ourself as a sleeper * while the sleepq lock is held to avoid wakeup races. This * is essentially a home rolled semaphore. */ sleepq_lock(&zone->uz_max_items); old = zone->uz_items; do { MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); /* Cache the max since we will evaluate twice. */ max = zone->uz_max_items; if (UZ_ITEMS_SLEEPERS(old) != 0 || UZ_ITEMS_COUNT(old) >= max) new = old + UZ_ITEMS_SLEEPER; else new = old + MIN(count, max - old); } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); /* We may have successfully allocated under the sleepq lock. */ if (UZ_ITEMS_SLEEPERS(new) == 0) { sleepq_release(&zone->uz_max_items); return (new - old); } /* * This is in a different cacheline from uz_items so that we * don't constantly invalidate the fastpath cacheline when we * adjust item counts. This could be limited to toggling on * transitions. */ atomic_add_32(&zone->uz_sleepers, 1); atomic_add_64(&zone->uz_sleeps, 1); /* * We have added ourselves as a sleeper. The sleepq lock * protects us from wakeup races. Sleep now and then retry. */ sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); sleepq_wait(&zone->uz_max_items, PVM); /* * After wakeup, remove ourselves as a sleeper and try * again. We no longer have the sleepq lock for protection. * * Subract ourselves as a sleeper while attempting to add * our count. */ atomic_subtract_32(&zone->uz_sleepers, 1); old = atomic_fetchadd_64(&zone->uz_items, -(UZ_ITEMS_SLEEPER - count)); /* We're no longer a sleeper. */ old -= UZ_ITEMS_SLEEPER; /* * If we're still at the limit, restart. Notably do not * block on other sleepers. Cache the max value to protect * against changes via sysctl. */ total = UZ_ITEMS_COUNT(old); max = zone->uz_max_items; if (total >= max) continue; /* Truncate if necessary, otherwise wake other sleepers. */ if (total + count > max) { zone_free_limit(zone, total + count - max); count = max - total; } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) wakeup_one(&zone->uz_max_items); return (count); } } /* * Allocate 'count' items from our max_items limit. Returns the number * available. If M_NOWAIT is not specified it will sleep until at least * one item can be allocated. */ static int zone_alloc_limit(uma_zone_t zone, int count, int flags) { uint64_t old; uint64_t max; max = zone->uz_max_items; MPASS(max > 0); /* * We expect normal allocations to succeed with a simple * fetchadd. 
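 *
 * As an illustrative example, assume uz_max_items is 100, the current
 * count is 90 and count is 16: the fetchadd returns old = 90,
 * 90 + 16 exceeds the limit, and since 90 < 100 we hand back the
 * 6-item excess through zone_free_limit() and return the 10 items
 * that did fit.  Only when the count is already at or above the limit
 * (which is also the case whenever sleepers are queued, since they
 * are encoded in the high bits of uz_items) do we fall into
 * zone_alloc_limit_hard().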
*/ old = atomic_fetchadd_64(&zone->uz_items, count); if (__predict_true(old + count <= max)) return (count); /* * If we had some items and no sleepers just return the * truncated value. We have to release the excess space * though because that may wake sleepers who weren't woken * because we were temporarily over the limit. */ if (old < max) { zone_free_limit(zone, (old + count) - max); return (max - old); } return (zone_alloc_limit_hard(zone, count, flags)); } /* * Free a number of items back to the limit. */ static void zone_free_limit(uma_zone_t zone, int count) { uint64_t old; MPASS(count > 0); /* * In the common case we either have no sleepers or * are still over the limit and can just return. */ old = atomic_fetchadd_64(&zone->uz_items, -count); if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) return; /* * Moderate the rate of wakeups. Sleepers will continue * to generate wakeups if necessary. */ wakeup_one(&zone->uz_max_items); } static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int error, maxbucket, cnt; CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name, zone, domain); /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) domain = UMA_ANYDOMAIN; if (zone->uz_max_items > 0) maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size, M_NOWAIT); else maxbucket = zone->uz_bucket_size; if (maxbucket == 0) return (NULL); /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) { cnt = 0; goto out; } bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, MIN(maxbucket, bucket->ub_entries), domain, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); error = zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); if (error != 0) break; } /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } cnt = bucket->ub_cnt; if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); counter_u64_add(zone->uz_fails, 1); bucket = NULL; } out: if (zone->uz_max_items > 0 && cnt < maxbucket) zone_free_limit(zone, maxbucket - cnt); return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { counter_u64_add(zone->uz_fails, 1); return (NULL); } /* Avoid allocs targeting empty domains. 
*/ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail_cnt; /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { int error; kasan_mark_item_valid(zone, item); error = zone->uz_init(item, zone->uz_size, flags); kasan_mark_item_invalid(zone, item); if (error != 0) { zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); goto fail_cnt; } } item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item == NULL) goto fail; counter_u64_add(zone->uz_allocs, 1); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail_cnt: counter_u64_add(zone->uz_fails, 1); fail: if (zone->uz_max_items > 0) zone_free_limit(zone, 1); CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); return (NULL); } /* See uma.h */ void uma_zfree_smr(uma_zone_t zone, void *item) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain; #ifdef NUMA int uz_flags; #endif CTR3(KTR_UMA, "uma_zfree_smr zone %s(%p) item %p", zone->uz_name, zone, item); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zfree_smr: called with non-SMR zone.")); KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer.")); SMR_ASSERT_NOT_ENTERED(zone->uz_smr); if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN) return; #endif cache = &zone->uz_cpu[curcpu]; itemdomain = 0; #ifdef NUMA uz_flags = cache_uz_flags(cache); if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* SMR Zones must free to the free bucket. */ bucket = &cache->uc_freebucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } #endif if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, NULL, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zone_free_item(zone, item, NULL, SKIP_NONE); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); CTR3(KTR_UMA, "uma_zfree_arg zone %s(%p) item %p", zone->uz_name, zone, item); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zfree_arg: called with SMR zone.")); if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN) return; #endif /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; /* * We are accessing the per-cpu cache without a critical section to * fetch size and flags. This is acceptable, if we are preempted we * will simply read another cpu's line. */ cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); if (UMA_ALWAYS_CTORDTOR || __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0)) item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. 
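 * For example, a thread may register itself in uz_sleepers just after
 * we load the counter as zero; this item then goes to the per-CPU
 * cache instead of directly back to the limit accounting, and the
 * sleeper simply waits for a later free or for cached items to be
 * released.  That costs latency, not correctness.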
*/ if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { if (atomic_load_32(&zone->uz_sleepers) > 0) goto zfree_item; } /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* * Try to free into the allocbucket first to give LIFO * ordering for cache-hot datastructures. Spill over * into the freebucket if necessary. Alloc will swap * them if one runs dry. */ bucket = &cache->uc_allocbucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } else #endif if (bucket->ucb_cnt == bucket->ucb_entries && cache->uc_freebucket.ucb_cnt < cache->uc_freebucket.ucb_entries) cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, udata, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); } #ifdef NUMA /* * sort crossdomain free buckets to domain correct buckets and cache * them. */ static void zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucketlist emptybuckets, fullbuckets; uma_zone_domain_t zdom; uma_bucket_t b; smr_seq_t seq; void *item; int domain; CTR3(KTR_UMA, "uma_zfree: zone %s(%p) draining cross bucket %p", zone->uz_name, zone, bucket); /* * It is possible for buckets to arrive here out of order so we fetch * the current smr seq rather than accepting the bucket's. */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); /* * To avoid having ndomain * ndomain buckets for sorting we have a * lock on the current crossfree bucket. A full matrix with * per-domain locking could be used if necessary. */ STAILQ_INIT(&emptybuckets); STAILQ_INIT(&fullbuckets); ZONE_CROSS_LOCK(zone); for (; bucket->ub_cnt > 0; bucket->ub_cnt--) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; domain = item_domain(item); zdom = ZDOM_GET(zone, domain); if (zdom->uzd_cross == NULL) { if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } else { /* * Avoid allocating a bucket with the cross lock * held, since allocation can trigger a * cross-domain free and bucket zones may * allocate from each other. 
*/ ZONE_CROSS_UNLOCK(zone); b = bucket_alloc(zone, udata, M_NOWAIT); if (b == NULL) goto out; ZONE_CROSS_LOCK(zone); if (zdom->uzd_cross != NULL) { STAILQ_INSERT_HEAD(&emptybuckets, b, ub_link); } else { zdom->uzd_cross = b; } } } b = zdom->uzd_cross; b->ub_bucket[b->ub_cnt++] = item; b->ub_seq = seq; if (b->ub_cnt == b->ub_entries) { STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link); if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } } ZONE_CROSS_UNLOCK(zone); out: if (bucket->ub_cnt == 0) bucket->ub_seq = SMR_SEQ_INVALID; bucket_free(zone, bucket, udata); while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); bucket_free(zone, b, udata); } while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) { STAILQ_REMOVE_HEAD(&fullbuckets, ub_link); domain = item_domain(b->ub_bucket[0]); zone_put_bucket(zone, domain, b, udata, true); } } #endif static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws) { #ifdef NUMA /* * Buckets coming from the wrong domain will be entirely for the * only other domain on two domain systems. In this case we can * simply cache them. Otherwise we need to sort them back to * correct domains. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) { zone_free_cross(zone, bucket, udata); return; } #endif /* * Attempt to save the bucket in the zone's domain bucket cache. */ CTR3(KTR_UMA, "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) itemdomain = zone_domain_lowest(zone, itemdomain); zone_put_bucket(zone, itemdomain, bucket, udata, ws); } /* * Populate a free or cross bucket for the current cpu cache. Free any * existing full bucket either to the zone cache or back to the slab layer. * * Enters and returns in a critical section. false return indicates that * we can not satisfy this free in the cache layer. true indicates that * the caller should retry. */ static __noinline bool cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, int itemdomain) { uma_cache_bucket_t cbucket; uma_bucket_t newbucket, bucket; CRITICAL_ASSERT(curthread); if (zone->uz_bucket_size == 0) return false; cache = &zone->uz_cpu[curcpu]; newbucket = NULL; /* * FIRSTTOUCH domains need to free to the correct zdom. When * enabled this is the zdom of the item. The bucket is the * cross bucket if the current domain and itemdomain do not match. */ cbucket = &cache->uc_freebucket; #ifdef NUMA if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain) { cbucket = &cache->uc_crossbucket; if (cbucket->ucb_cnt != 0) counter_u64_add(zone->uz_xdomain, cbucket->ucb_cnt); } } #endif bucket = cache_bucket_unload(cbucket); KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries, ("cache_free: Entered with non-full free bucket.")); /* We are no longer associated with this CPU. */ critical_exit(); /* * Don't let SMR zones operate without a free bucket. Force * a synchronize and re-use this one. We will only degrade * to a synchronize every bucket_size items rather than every * item if we fail to allocate a bucket. 
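 * In other words, with a hypothetical bucket size of 128, the worst
 * case under memory pressure is one SMR synchronization per 128
 * frees (draining and recycling the full free bucket in place)
 * instead of the per-item synchronization that a direct
 * zone_free_item() would incur for an SMR zone.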
*/ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) { if (bucket != NULL) bucket->ub_seq = smr_advance(zone->uz_smr); newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (newbucket == NULL && bucket != NULL) { bucket_drain(zone, bucket); newbucket = bucket; bucket = NULL; } } else if (!bucketdisable) newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (bucket != NULL) zone_free_bucket(zone, bucket, udata, itemdomain, true); critical_enter(); if ((bucket = newbucket) == NULL) return (false); cache = &zone->uz_cpu[curcpu]; #ifdef NUMA /* * Check to see if we should be populating the cross bucket. If it * is already populated we will fall through and attempt to populate * the free bucket. */ if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain && cache->uc_crossbucket.ucb_bucket == NULL) { cache_bucket_load_cross(cache, bucket); return (true); } } #endif /* * We may have lost the race to fill the bucket or switched CPUs. */ if (cache->uc_freebucket.ucb_bucket != NULL) { critical_exit(); bucket_free(zone, bucket, udata); critical_enter(); } else cache_bucket_load_free(cache, bucket); return (true); } static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; uma_domain_t dom; int freei; keg = zone->uz_keg; KEG_LOCK_ASSERT(keg, slab->us_domain); /* Do we need to remove from any lists? */ dom = &keg->uk_domain[slab->us_domain]; if (slab->us_freecount + 1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ freei = slab_item_index(slab, keg, item); BIT_SET(keg->uk_ipers, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ dom->ud_free_items++; } static void zone_release(void *arg, void **bucket, int cnt) { struct mtx *lock; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; void *item; int i; zone = arg; keg = zone->uz_keg; lock = NULL; if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) lock = KEG_LOCK(keg, 0); for (i = 0; i < cnt; i++) { item = bucket[i]; if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { slab = vtoslab((vm_offset_t)item); } else { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) slab = hash_sfind(&keg->uk_hash, mem); else slab = (uma_slab_t)(mem + keg->uk_pgoff); } if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { if (lock != NULL) mtx_unlock(lock); lock = KEG_LOCK(keg, slab->us_domain); } slab_free_item(zone, slab, item); } if (lock != NULL) mtx_unlock(lock); } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static __noinline void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { /* * If a free is sent directly to an SMR zone we have to * synchronize immediately because the item can instantly * be reallocated. This should only happen in degenerate * cases when no memory is available for per-cpu caches. 
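 *
 * For context, a purely illustrative sketch of how an SMR zone is
 * normally consumed (the list and structure names are hypothetical):
 *
 *	Reader:
 *		smr_enter(uma_zone_get_smr(zone));
 *		p = atomic_load_ptr(&foo_head);
 *		... read-only use of p ...
 *		smr_exit(uma_zone_get_smr(zone));
 *
 *	Writer:
 *		new = uma_zalloc_smr(zone, M_WAITOK);
 *		... publish new, unlink old ...
 *		uma_zfree_smr(zone, old);
 *
 * A reader between smr_enter() and smr_exit() may still hold a
 * pointer to the item being freed, which is why a free that bypasses
 * the per-CPU buckets must wait for the sequence to expire before the
 * item can be handed out again.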
*/ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE) smr_synchronize(zone->uz_smr); item_dtor(zone, item, zone->uz_size, udata, skip); if (skip < SKIP_FINI && zone->uz_fini) { kasan_mark_item_valid(zone, item); zone->uz_fini(item, zone->uz_size); kasan_mark_item_invalid(zone, item); } zone->uz_release(zone->uz_arg, &item, 1); if (skip & SKIP_CNT) return; counter_u64_add(zone->uz_frees, 1); if (zone->uz_max_items > 0) zone_free_limit(zone, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { /* * If the limit is small, we may need to constrain the maximum per-CPU * cache size, or disable caching entirely. */ uma_zone_set_maxcache(zone, nitems); /* * XXX This can misbehave if the zone has any allocations with * no limit and a limit is imposed. There is currently no * way to clear a limit. */ ZONE_LOCK(zone); if (zone->uz_max_items == 0) ZONE_ASSERT_COLD(zone); zone->uz_max_items = nitems; zone->uz_flags |= UMA_ZFLAG_LIMIT; zone_update_caches(zone); /* We may need to wake waiters. */ wakeup(&zone->uz_max_items); ZONE_UNLOCK(zone); return (nitems); } /* See uma.h */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems) { int bpcpu, bpdom, bsize, nb; ZONE_LOCK(zone); /* * Compute a lower bound on the number of items that may be cached in * the zone. Each CPU gets at least two buckets, and for cross-domain * frees we use an additional bucket per CPU and per domain. Select the * largest bucket size that does not exceed half of the requested limit, * with the left over space given to the full bucket cache. */ bpdom = 0; bpcpu = 2; #ifdef NUMA if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) { bpcpu++; bpdom++; } #endif nb = bpcpu * mp_ncpus + bpdom * vm_ndomains; bsize = nitems / nb / 2; if (bsize > BUCKET_MAX) bsize = BUCKET_MAX; else if (bsize == 0 && nitems / nb > 0) bsize = 1; zone->uz_bucket_size_max = zone->uz_bucket_size = bsize; if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) zone->uz_bucket_size_min = zone->uz_bucket_size_max; zone->uz_bucket_max = nitems - nb * bsize; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; nitems = atomic_load_64(&zone->uz_max_items); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_ASSERT_COLD(zone); zone->uz_warning = warning; } /* See uma.h */ void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) { ZONE_ASSERT_COLD(zone); TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs) - counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems < 0 ? 0 : nitems); } static uint64_t uma_zone_get_allocs(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); return (nitems); } static uint64_t uma_zone_get_frees(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems); } #ifdef INVARIANTS /* Used only for KEG_ASSERT_COLD(). 
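 * A keg or zone is "cold" while it has never satisfied an allocation,
 * and most of the tuning hooks below assert exactly that.  The
 * expected call order is therefore roughly (illustrative only; "bar"
 * and bar_alloc are hypothetical):
 *
 *	zone = uma_zcreate("bar", sizeof(struct bar), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	uma_zone_reserve(zone, 32);
 *	uma_zone_set_allocf(zone, bar_alloc);
 *
 * with the first uma_zalloc() happening only after the zone is fully
 * configured.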
*/ static uint64_t uma_keg_get_allocs(uma_keg_t keg) { uma_zone_t z; uint64_t nitems; nitems = 0; LIST_FOREACH(z, &keg->uk_zones, uz_link) nitems += uma_zone_get_allocs(z); return (nitems); } #endif /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_init = uminit; } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_fini = fini; } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_ASSERT_COLD(zone); zone->uz_init = zinit; } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_ASSERT_COLD(zone); zone->uz_fini = zfini; } /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_freef = freef; } /* See uma.h */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_allocf = allocf; } /* See uma.h */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr) { ZONE_ASSERT_COLD(zone); KASSERT(smr != NULL, ("Got NULL smr")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("zone %p (%s) already uses SMR", zone, zone->uz_name)); zone->uz_flags |= UMA_ZONE_SMR; zone->uz_smr = smr; zone_update_caches(zone); } smr_t uma_zone_get_smr(uma_zone_t zone) { return (zone->uz_smr); } /* See uma.h */ void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_reserve = items; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; u_int pages; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); ZONE_ASSERT_COLD(zone); pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); if (kva == 0) return (0); } else kva = 0; MPASS(keg->uk_kva == 0); keg->uk_kva = kva; keg->uk_offset = 0; zone->uz_max_items = pages * keg->uk_ipers; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone_update_caches(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { struct vm_domainset_iter di; uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; int aflags, domain, slabs; KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { aflags = M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); for (;;) { slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, aflags); if (slab != NULL) { dom = &keg->uk_domain[slab->us_domain]; /* * keg_alloc_slab() always returns a slab on the * partial list. */ LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; KEG_UNLOCK(keg, slab->us_domain); break; } if (vm_domainset_iter_policy(&di, &domain) != 0) vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); } } } /* * Returns a snapshot of memory consumption in bytes. 
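 * For a cache zone this is the number of items sitting in the zone's
 * per-domain bucket caches times uz_size; for a keg-backed zone it is
 * the keg's total page count times PAGE_SIZE.  As an illustrative
 * example, with hypothetical per-domain page counts of 10, 4 and 0
 * and a 4096-byte page, the result would be 14 * 4096 = 57344 bytes.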
*/ size_t uma_zone_memory(uma_zone_t zone) { size_t sz; int i; sz = 0; if (zone->uz_flags & UMA_ZFLAG_CACHE) { for (i = 0; i < vm_ndomains; i++) sz += ZDOM_GET(zone, i)->uzd_nitems; return (sz * zone->uz_size); } for (i = 0; i < vm_ndomains; i++) sz += zone->uz_keg->uk_domain[i].ud_pages; return (sz * PAGE_SIZE); } struct uma_reclaim_args { int domain; int req; }; static void uma_reclaim_domain_cb(uma_zone_t zone, void *arg) { struct uma_reclaim_args *args; args = arg; if ((zone->uz_flags & UMA_ZONE_UNMANAGED) == 0) uma_zone_reclaim_domain(zone, args->req, args->domain); } /* See uma.h */ void uma_reclaim(int req) { uma_reclaim_domain(req, UMA_ANYDOMAIN); } void uma_reclaim_domain(int req, int domain) { struct uma_reclaim_args args; bucket_enable(); args.domain = domain; args.req = req; sx_slock(&uma_reclaim_lock); switch (req) { case UMA_RECLAIM_TRIM: case UMA_RECLAIM_DRAIN: zone_foreach(uma_reclaim_domain_cb, &args); break; case UMA_RECLAIM_DRAIN_CPU: zone_foreach(uma_reclaim_domain_cb, &args); pcpu_cache_drain_safe(NULL); zone_foreach(uma_reclaim_domain_cb, &args); break; default: panic("unhandled reclamation request %d", req); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ uma_zone_reclaim_domain(slabzones[0], UMA_RECLAIM_DRAIN, domain); uma_zone_reclaim_domain(slabzones[1], UMA_RECLAIM_DRAIN, domain); bucket_zone_drain(domain); sx_sunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; void uma_reclaim_wakeup(void) { if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) wakeup(uma_reclaim); } void uma_reclaim_worker(void *arg __unused) { for (;;) { sx_xlock(&uma_reclaim_lock); while (atomic_load_int(&uma_reclaim_needed) == 0) sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", hz); sx_xunlock(&uma_reclaim_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); uma_reclaim(UMA_RECLAIM_DRAIN_CPU); atomic_store_int(&uma_reclaim_needed, 0); /* Don't fire more than once per-second. */ pause("umarclslp", hz); } } /* See uma.h */ void uma_zone_reclaim(uma_zone_t zone, int req) { uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN); } void uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain) { switch (req) { case UMA_RECLAIM_TRIM: zone_reclaim(zone, domain, M_NOWAIT, false); break; case UMA_RECLAIM_DRAIN: zone_reclaim(zone, domain, M_NOWAIT, true); break; case UMA_RECLAIM_DRAIN_CPU: pcpu_cache_drain_safe(zone); zone_reclaim(zone, domain, M_NOWAIT, true); break; default: panic("unhandled reclamation request %d", req); } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { return (atomic_load_32(&zone->uz_sleepers) > 0); } unsigned long uma_limit(void) { return (uma_kmem_limit); } void uma_set_limit(unsigned long limit) { uma_kmem_limit = limit; } unsigned long uma_size(void) { return (atomic_load_long(&uma_kmem_total)); } long uma_avail(void) { return (uma_kmem_limit - uma_size()); } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. 
* */ static void uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) { uma_cache_t cache; uint64_t allocs, frees, sleeps, xdomain; int cachefree, cpu; allocs = frees = sleeps = xdomain = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; cachefree += cache->uc_allocbucket.ucb_cnt; cachefree += cache->uc_freebucket.ucb_cnt; xdomain += cache->uc_crossbucket.ucb_cnt; cachefree += cache->uc_crossbucket.ucb_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += counter_u64_fetch(z->uz_allocs); frees += counter_u64_fetch(z->uz_frees); xdomain += counter_u64_fetch(z->uz_xdomain); sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; if (xdomainp != NULL) *xdomainp = xdomain; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static void uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, struct uma_percpu_stat *ups, bool internal) { uma_zone_domain_t zdom; uma_cache_t cache; int i; for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(z, i); uth->uth_zone_free += zdom->uzd_nitems; } uth->uth_allocs = counter_u64_fetch(z->uz_allocs); uth->uth_frees = counter_u64_fetch(z->uz_frees); uth->uth_fails = counter_u64_fetch(z->uz_fails); uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain); uth->uth_sleeps = z->uz_sleeps; for (i = 0; i < mp_maxid + 1; i++) { bzero(&ups[i], sizeof(*ups)); if (internal || CPU_ABSENT(i)) continue; cache = &z->uz_cpu[i]; ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; ups[i].ups_allocs = cache->uc_allocs; ups[i].ups_frees = cache->uc_frees; } } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat *ups; struct sbuf sbuf; uma_keg_t kz; uma_zone_t z; uint64_t items; uint32_t kfree, pages; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; /* * Insert stream header. 
*/ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { kfree = pages = 0; for (i = 0; i < vm_ndomains; i++) { kfree += kz->uk_domain[i].ud_free_items; pages += kz->uk_domain[i].ud_pages; } LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; if (z->uz_max_items > 0) { items = UZ_ITEMS_COUNT(z->uz_items); uth.uth_pages = (items / kz->uk_ipers) * kz->uk_ppera; } else uth.uth_pages = pages; uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * kz->uk_ppera; uth.uth_limit = z->uz_max_items; uth.uth_keg_free = kfree; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; uma_vm_zone_stats(&uth, z, &sbuf, ups, kz->uk_flags & UMA_ZFLAG_INTERNAL); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } LIST_FOREACH(z, &uma_cachezones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_size = z->uz_size; uma_vm_zone_stats(&uth, z, &sbuf, ups, false); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); free(ups, M_TEMP); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max; max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone; int cur; /* * Some callers want to add sysctls for global zones that * may not yet exist so they pass a pointer to a pointer. */ if (arg2 == 0) zone = *(uma_zone_t *)arg1; else zone = arg1; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_allocs(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_frees(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; uma_zone_t zone = arg1; int error; sbuf_new_for_sysctl(&sbuf, NULL, 0, req); if (zone->uz_flags != 0) sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); else sbuf_printf(&sbuf, "0"); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) { uma_keg_t keg = arg1; int avail, effpct, total; total = keg->uk_ppera * PAGE_SIZE; if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; /* * We consider the client's requested size and alignment here, not the * real size determination uk_rsize, because we also adjust the real * size for internal implementation reasons (max bitset size). 
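 *
 * Worked example with hypothetical values, assuming a 4096-byte page
 * and no off-page slab header: uk_ppera = 1, uk_size = 88,
 * uk_align = 7 and uk_ipers = 45 give
 * avail = 45 * roundup2(88, 8) = 3960, total = 4096 and
 * effpct = 100 * 3960 / 4096 = 96, i.e. roughly 96% efficiency.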
*/ avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) avail *= mp_maxid + 1; effpct = 100 * avail / total; return (sysctl_handle_int(oidp, &effpct, 0, req)); } static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); return (sysctl_handle_64(oidp, &cur, 0, req)); } #ifdef INVARIANTS static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; uma_keg_t keg; uint8_t *mem; /* * It is safe to return the slab here even though the * zone is unlocked because the item's allocation state * essentially holds a reference. */ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (NULL); if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) return (vtoslab((vm_offset_t)mem)); keg = zone->uz_keg; if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) return ((uma_slab_t)(mem + keg->uk_pgoff)); KEG_LOCK(keg, 0); slab = hash_sfind(&keg->uk_hash, mem); KEG_UNLOCK(keg, 0); return (slab); } static bool uma_dbg_zskip(uma_zone_t zone, void *mem) { if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (true); return (uma_dbg_kskip(zone->uz_keg, mem)); } static bool uma_dbg_kskip(uma_keg_t keg, void *mem) { uintptr_t idx; if (dbg_divisor == 0) return (true); if (dbg_divisor == 1) return (false); idx = (uintptr_t)mem >> PAGE_SHIFT; if (keg->uk_ipers > 1) { idx *= keg->uk_ipers; idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; } if ((idx / dbg_divisor) * dbg_divisor != idx) { counter_u64_add(uma_skip_cnt, 1); return (true); } counter_u64_add(uma_dbg_cnt, 1); return (false); } /* * Set up the slab's freei data such that uma_dbg_free can function. * */ static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } /* * Verifies freed addresses. Checks for alignment, valid slab membership * and duplicate frees. 
* */ static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: Freed item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (freei >= keg->uk_ipers) panic("Invalid free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (slab_item(slab, keg, freei) != item) panic("Unaligned free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } #endif /* INVARIANTS */ #ifdef DDB static int64_t get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, uint64_t *sleeps, long *cachefree, uint64_t *xdomain) { uint64_t frees; int i; if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { *allocs = counter_u64_fetch(z->uz_allocs); frees = counter_u64_fetch(z->uz_frees); *sleeps = z->uz_sleeps; *cachefree = 0; *xdomain = 0; } else uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, xdomain); for (i = 0; i < vm_ndomains; i++) { *cachefree += ZDOM_GET(z, i)->uzd_nitems; if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) *cachefree += kz->uk_domain[i].ud_free_items; } *used = *allocs - frees; return (((int64_t)*used + *cachefree) * kz->uk_size); } -DB_SHOW_COMMAND(uma, db_show_uma) +DB_SHOW_COMMAND_FLAGS(uma, db_show_uma, DB_CMD_MEMSAFE) { const char *fmt_hdr, *fmt_entry; uma_keg_t kz; uma_zone_t z; uint64_t allocs, used, sleeps, xdomain; long cachefree; /* variables for sorting */ uma_keg_t cur_keg; uma_zone_t cur_zone, last_zone; int64_t cur_size, last_size, size; int ties; /* /i option produces machine-parseable CSV output */ if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; } else { fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; } db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket", "Total Mem", "XFree"); /* Sort the zones with largest size first. */ last_zone = NULL; last_size = INT64_MAX; for (;;) { cur_zone = NULL; cur_size = -1; ties = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { /* * In the case of size ties, print out zones * in the order they are encountered. That is, * when we encounter the most recently output * zone, we have already printed all preceding * ties, and we must print all following ties. 
*/ if (z == last_zone) { ties = 1; continue; } size = get_uma_stats(kz, z, &allocs, &used, &sleeps, &cachefree, &xdomain); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_zone = z; cur_keg = kz; } } } if (cur_zone == NULL) break; size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, &sleeps, &cachefree, &xdomain); db_printf(fmt_entry, cur_zone->uz_name, (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, (uintmax_t)allocs, (uintmax_t)sleeps, (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, xdomain); if (db_pager_quit) return; last_zone = cur_zone; last_size = cur_size; } } -DB_SHOW_COMMAND(umacache, db_show_umacache) +DB_SHOW_COMMAND_FLAGS(umacache, db_show_umacache, DB_CMD_MEMSAFE) { uma_zone_t z; uint64_t allocs, frees; long cachefree; int i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); for (i = 0; i < vm_ndomains; i++) cachefree += ZDOM_GET(z, i)->uzd_nitems; db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_bucket_size); if (db_pager_quit) return; } } #endif /* DDB */ diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index ab9d21c3a0b8..06e463f3f753 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -1,2859 +1,2859 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean); static void vm_object_backing_remove(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM object stats"); static COUNTER_U64_DEFINE_EARLY(object_collapses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, "VM object collapses"); static COUNTER_U64_DEFINE_EARLY(object_bypasses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, "VM object bypasses"); static COUNTER_U64_DEFINE_EARLY(object_collapse_waits); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD, &object_collapse_waits, "Number of sleeps for collapse"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(!vm_object_busied(object), ("object %p busy = %d", object, blockcount_read(&object->busy))); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(atomic_load_int(&object->shadow_count) == 0, ("object %p shadow_count = %d", object, atomic_load_int(&object->shadow_count))); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); KASSERT(object->charge == 0 && object->cred == NULL, ("object %p has non-zero charge %ju (%p)", object, (uintmax_t)object->charge, object->cred)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; vm_radix_init(&object->rtree); refcount_init(&object->ref_count, 0); blockcount_init(&object->paging_in_progress); blockcount_init(&object->busy); object->resident_page_count = 0; atomic_store_int(&object->shadow_count, 0); object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, u_short flags, vm_object_t object, void *handle) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; object->flags = flags; if ((flags & OBJ_SWAP) != 0) { pctrie_init(&object->un_pager.swp.swp_blks); object->un_pager.swp.writemappings = 0; } /* * Ensure that swap_pager_swapoff() iteration over object_list * sees up to date type and pctrie head if it observed * non-dead object. 
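 * This is the release half of a release/acquire style pairing:
 * atomic_thread_fence_rel() keeps the stores above ordered before any
 * later store by this CPU.  The observer only benefits if its own
 * loads are ordered as well, for example by taking a lock or issuing
 * an acquire fence, which is assumed to be the case on the
 * swap_pager_swapoff() side.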
*/ atomic_thread_fence_rel(); object->pg_color = 0; object->size = size; object->domain.dr_policy = NULL; object->generation = 1; object->cleangeneration = 1; refcount_init(&object->ref_count, 1); object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = handle; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object, NULL); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif kernel_object->un_pager.phys.ops = &default_phys_pg_ops; /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. * * paging_in_progress is valid always. Lockless references to * the objects may acquire pip and then check OBJ_DEAD. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_DEAD) return (KERN_INVALID_ARGUMENT); if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { if (i > 0) blockcount_acquire(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { vm_object_pip_wakeupn(object, 1); } void vm_object_pip_wakeupn(vm_object_t object, short i) { if (i > 0) blockcount_release(&object->paging_in_progress, i); } /* * Atomically drop the object lock and wait for pip to drain. This protects * from sleep/wakeup races due to identity changes. The lock is not re-acquired * on return. */ static void vm_object_pip_sleep(vm_object_t object, const char *waitid) { (void)blockcount_sleep(&object->paging_in_progress, &object->lock, waitid, PVM | PDROP); } void vm_object_pip_wait(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); blockcount_wait(&object->paging_in_progress, &object->lock, waitid, PVM); } void vm_object_pip_wait_unlocked(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); blockcount_wait(&object->paging_in_progress, NULL, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. 
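 *
 *	Illustrative usage sketch (hypothetical caller; "len" is an
 *	assumed byte length).  The new object is returned holding a
 *	single reference, which is later dropped with
 *	vm_object_deallocate():
 *
 *		vm_object_t obj;
 *
 *		obj = vm_object_allocate(OBJT_SWAP, atop(len));
 *		... install obj in a map entry or hand it to a pager ...
 *		vm_object_deallocate(obj);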
*/ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; u_short flags; switch (type) { case OBJT_DEAD: panic("vm_object_allocate: can't create OBJT_DEAD"); case OBJT_SWAP: flags = OBJ_COLORED | OBJ_SWAP; break; case OBJT_DEVICE: case OBJT_SG: flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: flags = OBJ_UNMANAGED; break; case OBJT_VNODE: flags = 0; break; default: panic("vm_object_allocate: type %d is undefined or dynamic", type); } object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, flags, object, NULL); return (object); } vm_object_t vm_object_allocate_dyn(objtype_t dyntype, vm_pindex_t size, u_short flags) { vm_object_t object; MPASS(dyntype >= OBJT_FIRST_DYN /* && dyntype < nitems(pagertab) */); object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(dyntype, size, flags, object, NULL); return (object); } /* * vm_object_allocate_anon: * * Returns a new default object of the given size and marked as * anonymous memory for special split/collapse handling. Color * to be initialized by the caller. */ vm_object_t vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object, struct ucred *cred, vm_size_t charge) { vm_object_t handle, object; if (backing_object == NULL) handle = NULL; else if ((backing_object->flags & OBJ_ANON) != 0) handle = backing_object->handle; else handle = backing_object; object = uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(OBJT_SWAP, size, OBJ_ANON | OBJ_ONEMAPPING | OBJ_SWAP, object, handle); object->cred = cred; object->charge = cred != NULL ? charge : 0; return (object); } static void vm_object_reference_vnode(vm_object_t object) { u_int old; /* * vnode objects need the lock for the first reference * to serialize with vnode_object_deallocate(). */ if (!refcount_acquire_if_gt(&object->ref_count, 0)) { VM_OBJECT_RLOCK(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); VM_OBJECT_RUNLOCK(object); } } /* * vm_object_reference: * * Acquires a reference to the given object. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; if (object->type == OBJT_VNODE) vm_object_reference_vnode(object); else refcount_acquire(&object->ref_count); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { u_int old; VM_OBJECT_ASSERT_LOCKED(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_deallocate_vnode(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; bool last; KASSERT(object->type == OBJT_VNODE, ("vm_object_deallocate_vnode: not a vnode object")); KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp")); /* Object lock to protect handle lookup. */ last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (!last) return; if (!umtx_shm_vnobj_persistent) umtx_shm_object_terminated(object); /* vrele may need the vnode lock. */ vrele(vp); } /* * We dropped a reference on an object and discovered that it had a * single remaining shadow. 
This is a sibling of the reference we * dropped. Attempt to collapse the sibling and backing object. */ static vm_object_t vm_object_deallocate_anon(vm_object_t backing_object) { vm_object_t object; /* Fetch the final shadow. */ object = LIST_FIRST(&backing_object->shadow_head); KASSERT(object != NULL && atomic_load_int(&backing_object->shadow_count) == 1, ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d", backing_object->ref_count, atomic_load_int(&backing_object->shadow_count))); KASSERT((object->flags & OBJ_ANON) != 0, ("invalid shadow object %p", object)); if (!VM_OBJECT_TRYWLOCK(object)) { /* * Prevent object from disappearing since we do not have a * reference. */ vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); } else VM_OBJECT_WUNLOCK(backing_object); /* * Check for a collapse/terminate race with the last reference holder. */ if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 || !refcount_acquire_if_not_zero(&object->ref_count)) { VM_OBJECT_WUNLOCK(object); return (NULL); } backing_object = object->backing_object; if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0) vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); return (object); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; bool released; while (object != NULL) { /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. A ref count * of 1 may be a special case depending on the shadow count * being 0 or 1. These cases require a write lock on the * object. */ if ((object->flags & OBJ_ANON) == 0) released = refcount_release_if_gt(&object->ref_count, 1); else released = refcount_release_if_gt(&object->ref_count, 2); if (released) return; if (object->type == OBJT_VNODE) { VM_OBJECT_RLOCK(object); if (object->type == OBJT_VNODE) { vm_object_deallocate_vnode(object); return; } VM_OBJECT_RUNLOCK(object); } VM_OBJECT_WLOCK(object); KASSERT(object->ref_count > 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If this is not the final reference to an anonymous * object we may need to collapse the shadow chain. */ if (!refcount_release(&object->ref_count)) { if (object->ref_count > 1 || atomic_load_int(&object->shadow_count) == 0) { if ((object->flags & OBJ_ANON) != 0 && object->ref_count == 1) vm_object_set_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); return; } /* Handle collapsing last ref on anonymous objects. */ object = vm_object_deallocate_anon(object); continue; } /* * Handle the final reference to an object. We restart * the loop with the backing object to avoid recursion. 
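 *	(Iterating instead of recursing keeps kernel stack usage constant
 *	even when a long shadow chain is torn down.)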
*/ umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT(object->type == OBJT_SWAP, ("shadowed tmpfs v_object 2 %p", object)); vm_object_backing_remove(object); } KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_deallocate: Terminating dead object.")); vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); object = temp; } } void vm_object_destroy(vm_object_t object) { uma_zfree(obj_zone, object); } static void vm_object_sub_shadow(vm_object_t object) { KASSERT(object->shadow_count >= 1, ("object %p sub_shadow count zero", object)); atomic_subtract_int(&object->shadow_count, 1); } static void vm_object_backing_remove_locked(vm_object_t object) { vm_object_t backing_object; backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("vm_object_backing_remove: Removing collapsing object.")); vm_object_sub_shadow(backing_object); if ((object->flags & OBJ_SHADOWLIST) != 0) { LIST_REMOVE(object, shadow_list); vm_object_clear_flag(object, OBJ_SHADOWLIST); } object->backing_object = NULL; } static void vm_object_backing_remove(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; if ((object->flags & OBJ_SHADOWLIST) != 0) { VM_OBJECT_WLOCK(backing_object); vm_object_backing_remove_locked(object); VM_OBJECT_WUNLOCK(backing_object); } else { object->backing_object = NULL; vm_object_sub_shadow(backing_object); } } static void vm_object_backing_insert_locked(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); atomic_add_int(&backing_object->shadow_count, 1); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_ASSERT_WLOCKED(backing_object); LIST_INSERT_HEAD(&backing_object->shadow_head, object, shadow_list); vm_object_set_flag(object, OBJ_SHADOWLIST); } object->backing_object = backing_object; } static void vm_object_backing_insert(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); vm_object_backing_insert_locked(object, backing_object); VM_OBJECT_WUNLOCK(backing_object); } else { object->backing_object = backing_object; atomic_add_int(&backing_object->shadow_count, 1); } } /* * Insert an object into a backing_object's shadow list with an additional * reference to the backing_object added. */ static void vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); KASSERT((backing_object->flags & OBJ_DEAD) == 0, ("shadowing dead anonymous object")); vm_object_reference_locked(backing_object); vm_object_backing_insert_locked(object, backing_object); vm_object_clear_flag(backing_object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(backing_object); } else { vm_object_reference(backing_object); atomic_add_int(&backing_object->shadow_count, 1); object->backing_object = backing_object; } } /* * Transfer a backing reference from backing_object to object. */ static void vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object) { vm_object_t new_backing_object; /* * Note that the reference to backing_object->backing_object * moves from within backing_object to within object. 
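 *
 *	Illustrative sketch of the link transfer performed below:
 *
 *		before:	object -> backing_object -> new_backing_object
 *		after:	object -> new_backing_object
 *			backing_object -> (none)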
*/ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object == NULL) return; if ((new_backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(new_backing_object); vm_object_backing_remove_locked(backing_object); vm_object_backing_insert_locked(object, new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); } else { /* * shadow_count for new_backing_object is left * unchanged, its reference provided by backing_object * is replaced by object. */ object->backing_object = new_backing_object; backing_object->backing_object = NULL; } } /* * Wait for a concurrent collapse to settle. */ static void vm_object_collapse_wait(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while ((object->flags & OBJ_COLLAPSING) != 0) { vm_object_pip_wait(object, "vmcolwait"); counter_u64_add(object_collapse_waits, 1); } } /* * Waits for a backing object to clear a pending collapse and returns * it locked if it is an ANON object. */ static vm_object_t vm_object_backing_collapse_wait(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); for (;;) { backing_object = object->backing_object; if (backing_object == NULL || (backing_object->flags & OBJ_ANON) == 0) return (NULL); VM_OBJECT_WLOCK(backing_object); if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0) break; VM_OBJECT_WUNLOCK(object); vm_object_pip_sleep(backing_object, "vmbckwait"); counter_u64_add(object_collapse_waits, 1); VM_OBJECT_WLOCK(object); } return (backing_object); } /* * vm_object_terminate_pages removes any remaining pageable pages * from the object and resets the object to an empty state. */ static void vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); KASSERT(p->object == object && (p->ref_count & VPRC_OBJREF) != 0, ("vm_object_terminate_pages: page %p is inconsistent", p)); p->object = NULL; if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) { VM_CNT_INC(v_pfree); vm_page_free(p); } } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("terminating non-dead obj %p", object)); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("terminating collapsing obj %p", object)); KASSERT(object->backing_object == NULL, ("terminating shadow obj %p", object)); /* * Wait for the pageout daemon and other current users to be * done with the object. Note that new paging_in_progress * users can come after this wait, but they must check * OBJ_DEAD flag set (without unlocking the object), and avoid * the object being terminated. 
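 *
 *	Illustrative sketch of that discipline in a hypothetical
 *	paging_in_progress user, with the object lock held:
 *
 *		vm_object_pip_add(object, 1);
 *		if ((object->flags & OBJ_DEAD) != 0) {
 *			vm_object_pip_wakeup(object);
 *			return;			object is being terminated
 *		}
 *		... start paging activity against object ...
 *		vm_object_pip_wakeup(object);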
*/ vm_object_pip_wait(object, "objtrm"); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || (object->flags & OBJ_SWAP) != 0, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean) { vm_page_assert_busied(p); /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->a.flags & PGA_NOSYNC) != 0) { *allclean = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * For swap objects backing tmpfs regular files, do not flush anything, * but remove write protection on the mapped pages to update mtime through * mmaped writes. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t eio, res, allclean; VM_OBJECT_ASSERT_WLOCKED(object); if (!vm_object_mightbedirty(object) || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? 
object->size : OFF_TO_IDX(end + PAGE_MASK); allclean = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (vm_page_none_valid(p)) continue; if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) { if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &allclean)) { vm_page_xunbusy(p); continue; } if (object->type == OBJT_VNODE) { n = vm_object_page_collect_flush(object, p, pagerflags, flags, &allclean, &eio); if (eio) { res = FALSE; allclean = FALSE; } if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; allclean = FALSE; } } else { n = 1; vm_page_xunbusy(p); } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif /* * Leave updating cleangeneration for tmpfs objects to tmpfs * scan. It needs to update mtime, which happens for other * filesystems during page writeouts. */ if (allclean && object->type == OBJT_VNODE) object->cleangeneration = curgeneration; return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); vm_page_assert_xbusied(p); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. 
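 *
 *	Illustrative sketch of a caller (hypothetical msync()-style path;
 *	the object is passed unlocked, syncio requests synchronous
 *	writes, and a FALSE return means some dirty page could not be
 *	written back):
 *
 *		if (!vm_object_sync(object, offset, size, TRUE, FALSE))
 *			error = EIO;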
*/ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && vm_object_mightbedirty(object) != 0 && ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) == (OBJ_ONEMAPPING | OBJ_ANON)); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE) vm_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_SWAP objects, OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. 
This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. The page * can not be invalidated while the object lock is held. */ if (!vm_page_all_valid(tm) || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_tryxbusy(tm) == 0) { if (object != tobject) VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } if (!vm_page_busy_sleep(tm, "madvpo", 0)) VM_OBJECT_WUNLOCK(tobject); goto relookup; } vm_page_advise(tm, advice); vm_page_xunbusy(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length, struct ucred *cred, bool shared) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. * * If we hold the only reference we can guarantee that it won't * increase while we have the map locked. Otherwise the race is * harmless and we will end up with an extra shadow object that * will be collapsed later. */ if (source != NULL && source->ref_count == 1 && (source->flags & OBJ_ANON) != 0) return; /* * Allocate a new object with the given length. 
*/ result = vm_object_allocate_anon(atop(length), source, cred, length); /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (shared || source != NULL) { VM_OBJECT_WLOCK(result); /* * The new object shadows the source object, adding a * reference to it. Our caller changes his reference * to point to the new object, removing a reference to * the source object. Net result: no change of * reference count, unless the caller needs to add one * more reference due to forking a shared map entry. */ if (shared) { vm_object_reference_locked(result); vm_object_clear_flag(result, OBJ_ONEMAPPING); } /* * Try to optimize the result object's page color when * shadowing in order to maintain page coloring * consistency in the combined shadowed object. */ if (source != NULL) { vm_object_backing_insert(result, source); result->domain = source->domain; #if VM_NRESERVLEVEL > 0 vm_object_set_flag(result, (source->flags & OBJ_COLORED)); result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif } VM_OBJECT_WUNLOCK(result); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, backing_object; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0, ("vm_object_split: Splitting object with multiple mappings.")); if ((orig_object->flags & OBJ_ANON) == 0) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); new_object = vm_object_allocate_anon(size, orig_object, orig_object->cred, ptoa(size)); /* * We must wait for the orig_object to complete any in-progress * collapse so that the swap blocks are stable below. The * additional reference on backing_object by new object will * prevent further collapse operations until split completes. */ VM_OBJECT_WLOCK(orig_object); vm_object_collapse_wait(orig_object); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); new_object->domain = orig_object->domain; backing_object = orig_object->backing_object; if (backing_object != NULL) { vm_object_backing_insert_ref(new_object, backing_object); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; } if (orig_object->cred != NULL) { crhold(orig_object->cred); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } /* * Mark the split operation so that swap_pager_getpages() knows * that the object is in transition. */ vm_object_set_flag(orig_object, OBJ_SPLIT); #ifdef INVARIANTS idx = 0; #endif retry: m = vm_page_find_least(orig_object, offidxstart); KASSERT(m == NULL || idx <= m->pindex - offidxstart, ("%s: object %p was repopulated", __func__, orig_object)); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. 
* * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_tryxbusy(m) == 0) { VM_OBJECT_WUNLOCK(new_object); if (vm_page_busy_sleep(m, "spltwt", 0)) VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* * The page was left invalid. Likely placed there by * an incomplete fault. Just remove and ignore. */ if (vm_page_none_valid(m)) { if (vm_page_remove(m)) vm_page_free(m); continue; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif } /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); vm_object_clear_flag(orig_object, OBJ_SPLIT); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); /* The page is only NULL when rename fails. */ if (p == NULL) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } else if (p->object == object) { VM_OBJECT_WUNLOCK(backing_object); if (vm_page_busy_sleep(p, "vmocol", 0)) VM_OBJECT_WLOCK(object); } else { VM_OBJECT_WUNLOCK(object); if (!vm_page_busy_sleep(p, "vmocol", 0)) VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); } VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if ((backing_object->flags & OBJ_ANON) == 0) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. 
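 *
 *	Worked example (illustrative numbers only): with
 *	backing_offset_index == 4 and object->size == 8, only backing
 *	object indices in [4, 12) are examined, and backing index pi
 *	corresponds to parent index new_pindex = pi - 4.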
*/ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; if (p != NULL) { /* * If the backing object page is busy a * grandparent or older page may still be * undergoing CoW. It is not safe to collapse * the backing object until it is quiesced. */ if (vm_page_tryxbusy(p) == 0) return (false); /* * We raced with the fault handler that left * newly allocated invalid page on the object * queue and retried. */ if (!vm_page_all_valid(p)) goto unbusy_ret; } /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); /* * The valid check here is stable due to object lock * being required to clear valid and initiate paging. * Busy of p disallows fault handler to validate pp. */ if ((pp == NULL || vm_page_none_valid(pp)) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) goto unbusy_ret; if (p != NULL) vm_page_xunbusy(p); } return (true); unbusy_ret: if (p != NULL) vm_page_xunbusy(p); return (false); } static void vm_object_collapse_scan(vm_object_t object) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_tryxbusy(p) == 0) { next = vm_object_collapse_scan_wait(object, p); continue; } KASSERT(object->backing_object == backing_object, ("vm_object_collapse_scan: backing object mismatch %p != %p", object->backing_object, backing_object)); KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch %p != %p", p->object, backing_object)); if (p->pindex < backing_offset_index || new_pindex >= object->size) { vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } if (!vm_page_all_valid(p)) { KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_tryxbusy(pp) == 0) { vm_page_xunbusy(p); /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. */ next = vm_object_collapse_scan_wait(object, pp); continue; } if (pp != NULL && vm_page_none_valid(pp)) { /* * The page was invalid in the parent. Likely placed * there by an incomplete fault. Just remove and * ignore. p can replace it. */ if (vm_page_remove(pp)) vm_page_free(pp); pp = NULL; } if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. 
Destroy the original page from the * backing object. */ vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); if (pp != NULL) vm_page_xunbusy(pp); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { vm_page_xunbusy(p); next = vm_object_collapse_scan_wait(object, NULL); continue; } /* Use the old pindex to free the right page. */ vm_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif vm_page_xunbusy(p); } return; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON, ("collapsing invalid object")); /* * Wait for the backing_object to finish any pending * collapse so that the caller sees the shortest possible * shadow chain. */ backing_object = vm_object_backing_collapse_wait(object); if (backing_object == NULL) return; KASSERT(object->ref_count > 0 && object->ref_count > atomic_load_int(&object->shadow_count), ("collapse with invalid ref %d or shadow %d count.", object->ref_count, atomic_load_int(&object->shadow_count))); KASSERT((backing_object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: Backing object already collapsing.")); KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: object is already collapsing.")); /* * We know that we can either collapse the backing object if * the parent is the only reference to it, or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. */ if (backing_object->ref_count == 1) { KASSERT(atomic_load_int(&backing_object->shadow_count) == 1, ("vm_object_collapse: shadow_count: %d", atomic_load_int(&backing_object->shadow_count))); vm_object_pip_add(object, 1); vm_object_set_flag(object, OBJ_COLLAPSING); vm_object_pip_add(backing_object, 1); vm_object_set_flag(backing_object, OBJ_DEAD); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. * * swap_pager_copy() can sleep, in which case the * backing_object's and object's locks are released and * reacquired. */ swap_pager_copy(backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); /* * Object now shadows whatever backing_object did. */ vm_object_clear_flag(object, OBJ_COLLAPSING); vm_object_backing_transfer(object, backing_object); object->backing_object_offset += backing_object->backing_object_offset; VM_OBJECT_WUNLOCK(object); vm_object_pip_wakeup(object); /* * Discard backing_object. 
* * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); (void)refcount_release(&backing_object->ref_count); vm_object_terminate(backing_object); counter_u64_add(object_collapses, 1); VM_OBJECT_WLOCK(object); } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. * * The object lock and backing_object lock must not * be dropped during this sequence. */ if (!vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object != NULL) { vm_object_backing_insert_ref(object, new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ (void)refcount_release(&backing_object->ref_count); KASSERT(backing_object->ref_count >= 1, ( "backing_object %p was somehow dereferenced during collapse!", backing_object)); VM_OBJECT_WUNLOCK(backing_object); counter_u64_add(object_bypasses, 1); } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. 
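 *	The loop condition below also implements the special case noted
 *	in the function header: an "end" of zero extends the range to the
 *	end of the object, so, e.g., vm_object_page_remove(object, 0, 0, 0)
 *	visits every resident page.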
*/ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Skip invalid pages if asked to do so. Try to avoid acquiring * the busy lock, as some consumers rely on this to avoid * deadlocks. * * A thread may concurrently transition the page from invalid to * valid using only the busy lock, so the result of this check * is immediately stale. It is up to consumers to handle this, * for instance by ensuring that all invalid->valid transitions * happen with a mutex held, as may be possible for a * filesystem. */ if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p)) continue; /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ if (vm_page_tryxbusy(p) == 0) { if (vm_page_busy_sleep(p, "vmopar", 0)) VM_OBJECT_WLOCK(object); goto again; } if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p)) { vm_page_xunbusy(p); continue; } if (vm_page_wired(p)) { wired: if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { vm_page_invalid(p); vm_page_undirty(p); } vm_page_xunbusy(p); continue; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && !vm_page_none_valid(p)) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) goto wired; if (p->dirty != 0) { vm_page_xunbusy(p); continue; } } if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_all(p)) goto wired; vm_page_free(p); } vm_object_pip_wakeup(object); vm_pager_freespace(object, start, (end == 0 ? object->size : end) - start); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_deactivate_noreuse(p); } } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. 
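 *
 *	Illustrative sketch of a caller (hypothetical; "obj" and "size"
 *	are assumed):
 *
 *		VM_OBJECT_WLOCK(obj);
 *		if (!vm_object_populate(obj, 0, atop(size)))
 *			... some page could not be made valid ...
 *		VM_OBJECT_WUNLOCK(obj);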
*/ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); if ((prev_object->flags & OBJ_ANON) == 0) return (FALSE); VM_OBJECT_WLOCK(prev_object); /* * Try to collapse the object first. */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. 
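 *	For example (illustrative numbers only): with next_pindex == 10,
 *	next_size == 4 and a current size of 12 pages, the object grows
 *	to 14 pages below; a current size of 16 pages would be left
 *	unchanged.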
*/ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty_(vm_object_t object) { atomic_add_int(&object->generation, 1); } bool vm_object_mightbedirty_(vm_object_t object) { return (object->generation != object->cleangeneration); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } if (vm_page_trysbusy(tm) == 0) { for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; if (tm->object != tobject) VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } tobject = tm->object; if (!vm_page_busy_sleep(tm, "unwbo", VM_ALLOC_IGN_SBUSY)) VM_OBJECT_RUNLOCK(tobject); goto again; } vm_page_unwire(tm, queue); vm_page_sunbusy(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } } /* * Return the vnode for the given object, or NULL if none exists. * For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); vm_pager_getvp(object, &vp, NULL); return (vp); } /* * Busy the vm object. This prevents new pages belonging to the object from * becoming busy. Existing pages persist as busy. Callers are responsible * for checking page state before proceeding. */ void vm_object_busy(vm_object_t obj) { VM_OBJECT_ASSERT_LOCKED(obj); blockcount_acquire(&obj->busy, 1); /* The fence is required to order loads of page busy. 
*/ atomic_thread_fence_acq_rel(); } void vm_object_unbusy(vm_object_t obj) { blockcount_release(&obj->busy, 1); } void vm_object_busy_wait(vm_object_t obj, const char *wmesg) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)blockcount_sleep(&obj->busy, NULL, wmesg, PVM); } /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. */ bool vm_object_is_active(vm_object_t obj) { return (obj->ref_count > atomic_load_int(&obj->shadow_count)); } static int vm_object_list_handler(struct sysctl_req *req, bool swap_only) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; u_long sp; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD || (swap_only && (obj->flags & (OBJ_ANON | OBJ_SWAP)) == 0)) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD || (swap_only && (obj->flags & (OBJ_ANON | OBJ_SWAP)) == 0)) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = atomic_load_int(&obj->shadow_count); kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; if (!swap_only) { TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->a.queue == PQ_ACTIVE) kvo->kvo_active++; else if (m->a.queue == PQ_INACTIVE) kvo->kvo_inactive++; } } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; vp = NULL; kvo->kvo_type = vm_object_kvme_type(obj, swap_only ? NULL : &vp); if (vp != NULL) { vref(vp); } else if ((obj->flags & OBJ_ANON) != 0) { MPASS(kvo->kvo_type == KVME_TYPE_SWAP); kvo->kvo_me = (uintptr_t)obj; /* tmpfs objs are reported as vnodes */ kvo->kvo_backing_obj = (uintptr_t)obj->backing_object; sp = swap_pager_swapped_pages(obj); kvo->kvo_swapped = sp > UINT32_MAX ? 
UINT32_MAX : sp; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); maybe_yield(); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { return (vm_object_list_handler(req, false)); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); static int sysctl_vm_object_list_swap(SYSCTL_HANDLER_ARGS) { return (vm_object_list_handler(req, true)); } /* * This sysctl returns list of the anonymous or swap objects. Intent * is to provide stripped optimized list useful to analyze swap use. * Since technically non-swap (default) objects participate in the * shadow chains, and are converted to swap type as needed by swap * pager, we must report them. */ SYSCTL_PROC(_vm, OID_AUTO, swap_objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list_swap, "S,kinfo_vmobject", "List of swap VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (map == 0) return 0; if (entry == 0) { VM_MAP_ENTRY_FOREACH(tmpe, map) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; VM_MAP_ENTRY_FOREACH(tmpe, tmpm) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } -DB_SHOW_COMMAND(vmochk, vm_object_check) +DB_SHOW_COMMAND_FLAGS(vmochk, vm_object_check, DB_CMD_MEMSAFE) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if ((object->flags & OBJ_ANON) != 0) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } if (db_pager_quit) return; } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. 
*/ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", atomic_load_int(&object->shadow_count), object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); if (db_pager_quit) break; } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } -DB_SHOW_COMMAND(vmopag, vm_object_print_pages) +DB_SHOW_COMMAND_FLAGS(vmopag, vm_object_print_pages, DB_CMD_MEMSAFE) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (db_pager_quit) return; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; } } } #endif /* DDB */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index cb7ce428db28..e7500e9d3e71 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1,5593 +1,5593 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. 
*/ domainset_t __exclusive_cache_line vm_min_domains; domainset_t __exclusive_cache_line vm_severe_domains; static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM page statistics"); static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries, CTLFLAG_RD, &pqstate_commit_retries, "Number of failed per-page atomic queue state updates"); static COUNTER_U64_DEFINE_EARLY(queue_ops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, &queue_ops, "Number of batched queue operations"); static COUNTER_U64_DEFINE_EARLY(queue_nops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops, CTLFLAG_RD, &queue_nops, "Number of batched queue operations with no effects"); /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; struct bitset *vm_page_dump; long vm_page_dump_pages; static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static bool vm_page_free_prep(vm_page_t m); static void vm_page_free_toq(vm_page_t m); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, const uint16_t nflag); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags); static void vm_page_zone_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED); } /* * The cache page zone is initialized later since we need to be able to allocate * pages before UMA is fully initialized. 
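 * The zone-wide cache limit is taken from the vm.pgcache_zone_max_pcpu
 * tunable scaled by the number of CPUs; if the tunable is unset, each
 * pool's zone is instead limited to 0.1% of the pages in its domain.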
*/ static void vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int cache, domain, maxcache, pool; maxcache = 0; TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &maxcache); maxcache *= mp_ncpus; for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (pool = 0; pool < VM_NFREEPOOL; pool++) { pgcache = &vmd->vmd_pgcache[pool]; pgcache->domain = domain; pgcache->pool = pool; pgcache->zone = uma_zcache_create("vm pgcache", PAGE_SIZE, NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_VM); /* * Limit each pool's zone to 0.1% of the pages in the * domain. */ cache = maxcache != 0 ? maxcache : vmd->vmd_page_count / 1000; uma_zone_set_maxcache(pgcache->zone, cache); } } } SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose) { struct vm_domain *vmd; vm_page_t m; int ret; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) return (true); /* page does not exist, no failure */ vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); vm_domain_free_unlock(vmd); if (ret != 0) { vm_domain_freecnt_inc(vmd, -1); TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (verbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } return (ret); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. 
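 * For example, a blacklist supplied through the vm.blacklist
 * environment variable might look like the following (the addresses
 * are purely illustrative):
 *
 *	vm.blacklist="0x7f654000,0x7f655000 0x12345000"
 *
 * Entries may be separated by spaces, commas, or newlines, as parsed
 * by vm_page_blacklist_next() above.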
*/ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; char *next; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; vm_page_blacklist_add(pa, bootverbose); } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Initialize a dummy page for use in scans of the specified paging queue. * In principle, this function only needs to set the flag PG_MARKER. * Nonetheless, it write busies the page as a safety precaution. */ void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->a.flags = aflags; marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE; marker->a.queue = queue; } static void vm_page_domain_init(int domain) { struct vm_domain *vmd; struct vm_pagequeue *pq; int i; vmd = VM_DOMAIN(domain); bzero(vmd, sizeof(*vmd)); *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); pq->pq_pdpages = 0; vm_page_init_marker(&vmd->vmd_markers[i], i, 0); } mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); /* * inacthead is used to provide FIFO ordering for LRU-bypassing * insertions. */ vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The clock pages are used to implement active queue scanning without * requeues. Scans start at clock[0], which is advanced after the scan * ends. When the two clock hands meet, they are reset and scanning * resumes from the head of the queue. 
*/ vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[1], plinks.q); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_FREED; m->flags = m->a.flags = 0; m->phys_addr = pa; m->a.queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } #ifndef PMAP_HAS_PAGE_ARRAY static vm_paddr_t vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range) { vm_paddr_t new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ *vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = page_range; return (new_end); } #endif /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; struct vm_domain *vmd; vm_page_t m; char *list, *listend; vm_paddr_t end, high_avail, low_avail, new_end, size; vm_paddr_t page_range __unused; vm_paddr_t last_pa, pa, startp, endp; u_long pagecount; #if MINIDUMP_PAGE_TRACKING u_long vm_page_dump_size; #endif int biggestone, i, segind; #ifdef WITNESS vm_offset_t mapped; int witness_size; #endif #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif vaddr = round_page(vaddr); vm_phys_early_startup(); biggestone = vm_phys_avail_largest(); end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); new_end = end; #ifdef WITNESS witness_size = round_page(witness_startup_count()); new_end -= witness_size; mapped = pmap_map(&vaddr, new_end, new_end + witness_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, witness_size); witness_startup((void *)mapped); #endif #if MINIDUMP_PAGE_TRACKING /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. 
*/ last_pa = 0; vm_page_dump_pages = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) { vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) - dump_avail[i] / PAGE_SIZE; if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; } vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages)); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include the UMA bootstrap pages, witness pages and vm_page_dump * in a crash dump. When pmap_map() uses the direct map, they are * not automatically included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef PMAP_HAS_PAGE_ARRAY pmap_page_array_startup(size / PAGE_SIZE); biggestone = vm_phys_avail_largest(); end = new_end = phys_avail[biggestone + 1]; #else #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. 
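 * As a worked example, assume 4 KB pages and an illustrative
 * sizeof(struct vm_page) of 104 bytes.  For size = 1 GB:
 *
 *	page_range = size / (PAGE_SIZE + sizeof(struct vm_page))
 *	           = 1073741824 / 4200 = 255652
 *
 * and the remainder, 3424 bytes, is smaller than PAGE_SIZE, so no
 * compensation page needs to be subtracted in this case.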
*/ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; new_end = vm_page_array_alloc(&vaddr, end, page_range); #endif #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ new_end = vm_reserv_startup(&vaddr, new_end); #endif #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) if (vm_phys_avail_size(i) != 0) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment's pages that are covered by one of * phys_avail's ranges to the free lists. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (seg->end <= phys_avail[i] || seg->start >= phys_avail[i + 1]) continue; startp = MAX(seg->start, phys_avail[i]); endp = MIN(seg->end, phys_avail[i + 1]); pagecount = (u_long)atop(endp - startp); if (pagecount == 0) continue; m = seg->first_page + atop(startp - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << segind; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_trybusy * * Helper routine for grab functions to trylock busy. * * Returns true on success and false on failure. */ static bool vm_page_trybusy(vm_page_t m, int allocflags) { if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0) return (vm_page_trysbusy(m)); else return (vm_page_tryxbusy(m)); } /* * vm_page_tryacquire * * Helper routine for grab functions to trylock busy and wire. * * Returns true on success and false on failure. */ static inline bool vm_page_tryacquire(vm_page_t m, int allocflags) { bool locked; locked = vm_page_trybusy(m, allocflags); if (locked && (allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); return (locked); } /* * vm_page_busy_acquire: * * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop * and drop the object lock if necessary. 
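 * VM_ALLOC_SBUSY or VM_ALLOC_IGN_SBUSY acquire a shared busy instead
 * of an exclusive busy, VM_ALLOC_WIRED additionally wires the page,
 * VM_ALLOC_NOWAIT fails immediately instead of sleeping, and
 * VM_ALLOC_WAITFAIL gives up after the first failed attempt.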
*/ bool vm_page_busy_acquire(vm_page_t m, int allocflags) { vm_object_t obj; bool locked; /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = atomic_load_ptr(&m->object); for (;;) { if (vm_page_tryacquire(m, allocflags)) return (true); if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); if (obj != NULL) locked = VM_OBJECT_WOWNED(obj); else locked = false; MPASS(locked || vm_page_wired(m)); if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags, locked) && locked) VM_OBJECT_WLOCK(obj); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); KASSERT(m->object == obj || m->object == NULL, ("vm_page_busy_acquire: page %p does not belong to %p", m, obj)); } } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); x = vm_page_busy_fetch(m); for (;;) { if (atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_SHARERS_WORD(1))) break; } if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * * vm_page_busy_tryupgrade: * * Attempt to upgrade a single shared busy into an exclusive busy. */ int vm_page_busy_tryupgrade(vm_page_t m) { u_int ce, x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); ce = VPB_CURTHREAD_EXCLUSIVE; for (;;) { if (VPB_SHARERS(x) > 1) return (0); KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_busy_tryupgrade: invalid lock state")); if (!atomic_fcmpset_acq_int(&m->busy_lock, &x, ce | (x & VPB_BIT_WAITERS))) continue; return (1); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); for (;;) { KASSERT(x != VPB_FREED, ("vm_page_sunbusy: Unlocking freed page.")); if (VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) == 0) break; wakeup(m); break; } } /* * vm_page_busy_sleep: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of the busying mechanism. * * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function * will not sleep if the page is shared-busy. * * The object lock must be held on entry. * * Returns true if it slept and dropped the object lock, or false * if there was no sleep and the lock is still held. */ bool vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags) { vm_object_t obj; obj = m->object; VM_OBJECT_ASSERT_LOCKED(obj); return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags, true)); } /* * vm_page_busy_sleep_unlocked: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function * will not sleep if the page is shared-busy. * * The object lock must not be held on entry. The operation will * return if the page changes identity. 
*/ void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false); } /* * _vm_page_busy_sleep: * * Internal busy sleep function. Verifies the page identity and * lockstate against parameters. Returns true if it sleeps and * false otherwise. * * allocflags uses VM_ALLOC_* flags to specify the lock required. * * If locked is true the lock will be dropped for any true returns * and held for any false returns. */ static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { bool xsleep; u_int x; /* * If the object is busy we must wait for that to drain to zero * before trying the page again. */ if (obj != NULL && vm_object_busied(obj)) { if (locked) VM_OBJECT_DROP(obj); vm_object_busy_wait(obj, wmesg); return (true); } if (!vm_page_busied(m)) return (false); xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0; sleepq_lock(m); x = vm_page_busy_fetch(m); do { /* * If the page changes objects or becomes unlocked we can * simply return. */ if (x == VPB_UNBUSIED || (xsleep && (x & VPB_BIT_SHARED) != 0) || m->object != obj || m->pindex != pindex) { sleepq_release(m); return (false); } if ((x & VPB_BIT_WAITERS) != 0) break; } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS)); if (locked) VM_OBJECT_DROP(obj); DROP_GIANT(); sleepq_add(m, NULL, wmesg, 0, 0); sleepq_wait(m, PVM); PICKUP_GIANT(); return (true); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { vm_object_t obj; u_int x; obj = m->object; x = vm_page_busy_fetch(m); for (;;) { if ((x & VPB_BIT_SHARED) == 0) return (0); /* * Reduce the window for transient busies that will trigger * false negatives in vm_page_ps_test(). */ if (obj != NULL && vm_object_busied(obj)) return (0); if (atomic_fcmpset_acq_int(&m->busy_lock, &x, x + VPB_ONE_SHARER)) break; } /* Refetch the object now that we're guaranteed that it is stable. */ obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_sunbusy(m); return (0); } return (1); } /* * vm_page_tryxbusy: * * Try to exclusive busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_tryxbusy(vm_page_t m) { vm_object_t obj; if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, VPB_CURTHREAD_EXCLUSIVE) == 0) return (0); obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_xunbusy(m); return (0); } return (1); } static void vm_page_xunbusy_hard_tail(vm_page_t m) { atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* Wake the waiter. */ wakeup(m); } /* * vm_page_xunbusy_hard: * * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_xunbusy_hard_tail(m); } void vm_page_xunbusy_hard_unchecked(vm_page_t m) { vm_page_assert_xbusied_unchecked(m); vm_page_xunbusy_hard_tail(m); } static void vm_page_busy_free(vm_page_t m) { u_int x; atomic_thread_fence_rel(); x = atomic_swap_int(&m->busy_lock, VPB_FREED); if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. 
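 * For example (illustrative), pages previously held by
 * vm_fault_quick_hold_pages() are released with:
 *
 *	vm_page_unhold_pages(ma, count);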
*/ void vm_page_unhold_pages(vm_page_t *ma, int count) { for (; count != 0; count--) { vm_page_unwire(*ma, PQ_ACTIVE); ma++; } } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->a.queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; /* Fictitious pages are unevictable. */ m->ref_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); vm_page_assert_xbusied(m); vm_page_busy_free(m); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_xunbusy_unchecked(m); } /* * Destroy the identity of an invalid page and free it if possible. * This is intended to be used when reading a page from backing store fails. 
*/ void vm_page_free_invalid(vm_page_t m) { KASSERT(vm_page_none_valid(m), ("page %p is valid", m)); KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); KASSERT(m->object != NULL, ("page %p has no object", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * We may be attempting to free the page as part of the handling for an * I/O error, in which case the page was xbusied by a different thread. */ vm_page_xbusy_claim(m); /* * If someone has wired this page while the object lock * was not held, then the thread that unwires is responsible * for freeing the page. Otherwise just free the page now. * The wire count of this unmapped page cannot change while * we have the page xbusy and the page's object wlocked. */ if (vm_page_remove(m)) vm_page_free(m); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page. */ m->object = object; m->pindex = pindex; m->ref_count |= VPRC_OBJREF; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; m->ref_count &= ~VPRC_OBJREF; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. 
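 * For example, a caller might do ("object" and "offset" are
 * illustrative):
 *
 *	VM_OBJECT_RLOCK(object);
 *	m = vm_page_lookup(object, OFF_TO_IDX(offset));
 *	if (m != NULL)
 *		...
 *	VM_OBJECT_RUNLOCK(object);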
*/ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("vm_page_insert_radixdone: page %p is missing object ref", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_radixdone: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_radixdone: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's generation count. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * Do the work to remove a page from its object. The caller is responsible for * updating the page's fields to reflect this removal. */ static void vm_page_object_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem __diagused; vm_page_assert_xbusied(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); m->object = NULL; mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Returns true if the object's reference * was the last reference to the page, and false otherwise. * * The object must be locked and the page must be exclusively busied. * The exclusive busy will be released on return. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ bool vm_page_remove(vm_page_t m) { bool dropped; dropped = vm_page_remove_xbusy(m); vm_page_xunbusy(m); return (dropped); } /* * vm_page_remove_xbusy * * Removes the page but leaves the xbusy held. Returns true if this * removed the final ref and false otherwise. */ bool vm_page_remove_xbusy(vm_page_t m) { vm_page_object_remove(m); return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_lookup_unlocked: * * Returns the page associated with the object/offset pair specified; * if none is found, NULL is returned. The page may be no longer be * present in the object at the time that this function returns. Only * useful for opportunistic checks such as inmem(). 
*/ vm_page_t vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex) { return (vm_radix_lookup_unlocked(&object->rtree, pindex)); } /* * vm_page_relookup: * * Returns a page that must already have been busied by * the caller. Used for bogus page replacement. */ vm_page_t vm_page_relookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; m = vm_radix_lookup_unlocked(&object->rtree, pindex); KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) && m->object == object && m->pindex == pindex, ("vm_page_relookup: Invalid page %p", m)); return (m); } /* * This should only be used by lockless functions for releasing transient * incorrect acquires. The page may have been freed after we acquired a * busy lock. In this case busy_lock == VPB_FREED and we have nothing * further to do. */ static void vm_page_busy_release(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); for (;;) { if (x == VPB_FREED) break; if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & VPB_BIT_SHARED) != 0 || (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE, ("vm_page_busy_release: %p xbusy not owned.", m)); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); break; } } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * Both pages must be exclusively busied on enter. The old page is * unbusied on exit. * * A return value of true means mold is now free. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ static bool vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret __diagused; bool dropped; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_xbusied(mold); KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0, ("vm_page_replace: page %p already in object", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. 
*/ mnew->object = object; mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mret = vm_radix_replace(&object->rtree, mnew); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); KASSERT((mold->oflags & VPO_UNMANAGED) == (mnew->oflags & VPO_UNMANAGED), ("vm_page_replace: mismatched VPO_UNMANAGED")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; /* * The object's resident_page_count does not change because we have * swapped one page for another, but the generation count should * change if the page is dirty. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF; vm_page_xunbusy(mold); return (dropped); } void vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_assert_xbusied(mnew); if (vm_page_replace_hold(mnew, object, pindex, mold)) vm_page_free(mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m)); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_object_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. 
* * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, vm_radix_lookup_le(&object->rtree, pindex))); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, vm_radix_lookup_le(&object->rtree, pindex))); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specify * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ static int _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages) { u_int limit, old, new; if (req_class == VM_ALLOC_INTERRUPT) limit = 0; else if (req_class == VM_ALLOC_SYSTEM) limit = vmd->vmd_interrupt_free_min; else limit = vmd->vmd_free_reserved; /* * Attempt to reserve the pages. Fail if we're below the limit. */ limit += npages; old = vmd->vmd_free_count; do { if (old < limit) return (0); new = old - npages; } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); /* Wake the page daemon if we've crossed the threshold. */ if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) pagedaemon_wakeup(vmd->vmd_domain); /* Only update bitsets on transitions. */ if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) vm_domain_set(vmd); return (1); } int vm_domain_allocate(struct vm_domain *vmd, int req, int npages) { int req_class; /* * The page daemon is allowed to dig deeper into the free page list.
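 * A VM_ALLOC_NORMAL request from the page daemon is therefore promoted
 * to VM_ALLOC_SYSTEM below, which lowers the limit checked by
 * _vm_domain_allocate() from vmd_free_reserved to
 * vmd_interrupt_free_min.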
*/ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; return (_vm_domain_allocate(vmd, req_class, npages)); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m; int flags; #define VPA_FLAGS (VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL | \ VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY | \ VM_ALLOC_SBUSY | VM_ALLOC_WIRED | \ VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK) KASSERT((req & ~VPA_FLAGS) == 0, ("invalid request %#x", req)); KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("invalid request %#x", req)); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); VM_OBJECT_ASSERT_WLOCKED(object); flags = 0; m = NULL; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the page from a reservation? */ if (vm_object_reserv(object) && (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) != NULL) { goto found; } #endif vmd = VM_DOMAIN(domain); if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { /* * If not, allocate it from the free page queues. */ vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags |= m->flags & PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->a.flags = 0; m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); else m->busy_lock = VPB_UNBUSIED; if (req & VM_ALLOC_WIRED) { vm_wire_add(1); m->ref_count = 1; } m->a.act_count = 0; if (vm_page_insert_after(m, object, pindex, mpred)) { if (req & VM_ALLOC_WIRED) { vm_wire_sub(1); m->ref_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. 
Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } static vm_page_t vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_page_t m_ret; /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ vmd = VM_DOMAIN(domain); if (!vm_domain_allocate(vmd, req, npages)) return (NULL); /* * Try to allocate the pages from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret != NULL) return (m_ret); #if VM_NRESERVLEVEL > 0 /* * Try to break a reservation to allocate the pages. 
*/ if ((req & VM_ALLOC_NORECLAIM) == 0) { m_ret = vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary); if (m_ret != NULL) return (m_ret); } #endif vm_domain_freecnt_inc(vmd, npages); return (NULL); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; #define VPAC_FLAGS (VPA_FLAGS | VM_ALLOC_NORECLAIM) KASSERT((req & ~VPAC_FLAGS) == 0, ("invalid request %#x", req)); KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("invalid request %#x", req)); KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), ("invalid request %#x", req)); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); for (;;) { #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, mpred, npages, low, high, alignment, boundary)) != NULL) { break; } #endif if ((m_ret = vm_page_find_contig_domain(domain, req, npages, low, high, alignment, boundary)) != NULL) break; if (!vm_domain_alloc_fail(VM_DOMAIN(domain), object, req)) return (NULL); } for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); else busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; for (m = m_ret; m < &m_ret[npages]; m++) { m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = oflags; if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->ref_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Allocate a physical page that is not intended to be inserted into a VM * object. If the "freelist" parameter is not equal to VM_NFREELIST, then only * pages from the specified vm_phys freelist will be returned. 
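 *
 * Illustrative sketch (not part of the original code): a caller that needs
 * a page with no backing object, e.g. for a scratch buffer, might use
 *
 *	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
 *	    VM_ALLOC_WAITOK);
 *
 * or, to draw from a particular vm_phys freelist ("fl" is a placeholder
 * index):
 *
 *	m = vm_page_alloc_freelist(fl, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT);
 *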
*/ static __always_inline vm_page_t _vm_page_alloc_noobj_domain(int domain, const int freelist, int req) { struct vm_domain *vmd; vm_page_t m; int flags; #define VPAN_FLAGS (VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL | \ VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | \ VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | \ VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK) KASSERT((req & ~VPAN_FLAGS) == 0, ("invalid request %#x", req)); flags = (req & VM_ALLOC_NODUMP) != 0 ? PG_NODUMP : 0; vmd = VM_DOMAIN(domain); again: if (freelist == VM_NFREELIST && vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); if (freelist == VM_NFREELIST) m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0); else m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (freelist == VM_NFREELIST && vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Consumers should not rely on a useful default pindex value. */ m->pindex = 0xdeadc0dedeadc0de; m->flags = (m->flags & PG_ZERO) | flags; m->a.flags = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) { vm_wire_add(1); m->ref_count = 1; } if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { KASSERT(freelist >= 0 && freelist < VM_NFREELIST, ("%s: invalid freelist %d", __func__, freelist)); return (_vm_page_alloc_noobj_domain(domain, freelist, req)); } vm_page_t vm_page_alloc_noobj(int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_noobj_domain(domain, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_noobj_domain(int domain, int req) { return (_vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req)); } vm_page_t vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret; u_int flags; #define VPANC_FLAGS (VPAN_FLAGS | VM_ALLOC_NORECLAIM) KASSERT((req & ~VPANC_FLAGS) == 0, ("invalid request %#x", req)); KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), 
("invalid request %#x", req)); KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("invalid request %#x", req)); KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); while ((m_ret = vm_page_find_contig_domain(domain, req, npages, low, high, alignment, boundary)) == NULL) { if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req)) return (NULL); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); /* * Consumers should not rely on a useful default pindex value. */ m->pindex = 0xdeadc0dedeadc0de; m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = VPO_UNMANAGED; /* * Zero the page before updating any mappings since the page is * not yet shared with any devices which might require the * non-default memory attribute. pmap_page_set_memattr() * flushes data caches before returning. */ if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); } return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); pmap_vm_page_alloc_check(m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* * The page daemon should avoid creating extra memory pressure since its * main purpose is to replenish the store of free pages. */ if (vmd->vmd_severeset || curproc == pageproc || !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. 
The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, ("fictitious page %p has invalid ref count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if (!vm_addr_align_ok(pa, alignment)) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); retry: m_inc = 1; if (vm_page_wired(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ VM_OBJECT_RLOCK(object); if (object != m->object) { VM_OBJECT_RUNLOCK(object); goto retry; } /* Don't care: PG_NODUMP, PG_ZERO. */ if ((object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ run_ext = 1; } else run_ext = 0; VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. 
*/ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page busy lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Racily check for wirings. Races are handled once the object * lock is held and the page is unmapped. */ if (vm_page_wired(m)) error = EBUSY; else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ VM_OBJECT_WLOCK(object); /* Don't care: PG_NODUMP, PG_ZERO. */ if (m->object != object || ((object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE)) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (vm_page_queue(m) != PQ_NONE && vm_page_tryxbusy(m) != 0) { if (vm_page_wired(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock; } KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT(m->oflags == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ if (!vm_page_none_valid(m)) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. 
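 *
 * Illustrative summary (not part of the original code) of the fallback
 * order implemented below for the replacement page:
 *
 *	1. vm_page_alloc_noobj_contig(req, 1, round_page(high),
 *	       ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 *	2. failing that, allocate below the run, in [0, start of m_run);
 *	3. as a last resort, allocate in [end of m_run, high].
 *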
*/ req = req_class; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_noobj_contig( req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_noobj_contig( req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_noobj_contig( req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { vm_page_xunbusy(m); error = ENOMEM; goto unlock; } /* * Unmap the page and check for new * wirings that may have been acquired * through a pmap lookup. */ if (object->ref_count != 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); vm_page_free(m_new); error = EBUSY; goto unlock; } /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ m_new->a.flags = m->a.flags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = 0; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_replace_hold(m_new, object, m->pindex, m) && vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_page_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_page_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. 
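 *
 * Illustrative usage sketch (not part of the original code, locking
 * elided) of the retry pattern described above, with placeholder
 * variables:
 *
 *	while ((m = vm_page_alloc_contig(obj, pindex, req, npages, low,
 *	    high, alignment, boundary, VM_MEMATTR_DEFAULT)) == NULL) {
 *		if (!vm_page_reclaim_contig(req, npages, low, high,
 *		    alignment, boundary))
 *			vm_wait(obj);
 *	}
 *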
* * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, minalign, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); /* * The caller will attempt an allocation after some runs have been * reclaimed and added to the vm_phys buddy lists. Due to limitations * of vm_phys_alloc_contig(), round up the requested length to the next * power of two or maximum chunk size, and ensure that each run is * suitably aligned. */ minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1); npages = roundup2(npages, minalign); if (alignment < ptoa(minalign)) alignment = ptoa(minalign); /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. 
*/ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). */ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } int vm_wait_doms(const domainset_t *wdoms, int mflags) { int error; error = 0; /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP | mflags, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { if (pageproc == NULL) panic("vm_wait in early boot"); vm_min_waiters++; error = msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP | mflags, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } return (error); } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. * - Called in various places after failed memory allocations. 
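 *
 * Illustrative sketch (not part of the original code): a non-sleeping
 * allocation retried around vm_wait(), using placeholder variables:
 *
 *	while ((m = vm_page_alloc_noobj(VM_ALLOC_NORMAL |
 *	    VM_ALLOC_NOWAIT)) == NULL)
 *		vm_wait(NULL);
 *
 * vm_wait_domain() below is the per-domain variant for callers that
 * already know which domain they are waiting on.
 *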
*/ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom, 0); } } static int vm_wait_flags(vm_object_t obj, int mflags) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; return (vm_wait_doms(&d->ds_mask, mflags)); } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { (void)vm_wait_flags(obj, 0); } int vm_wait_intr(vm_object_t obj) { return (vm_wait_flags(obj, PCATCH)); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { vm_domain_free_assert_unlocked(vmd); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } return (0); } /* * vm_waitpfault: * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(struct domainset *dset, int timo) { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", timo); } else mtx_unlock(&vm_domainset_lock); } static struct vm_pagequeue * _vm_page_pagequeue(vm_page_t m, uint8_t queue) { return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } #ifdef INVARIANTS static struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue)); } #endif static __always_inline bool vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_astate_t tmp; tmp = *old; do { if (__predict_true(vm_page_astate_fcmpset(m, old, new))) return (true); counter_u64_add(pqstate_commit_retries, 1); } while (old->_bits == tmp._bits); return (false); } /* * Do the work of committing a queue state update that moves the page out of * its current queue. 
*/ static bool _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_t next; vm_pagequeue_assert_locked(pq); KASSERT(vm_page_pagequeue(m) == pq, ("%s: queue %p does not match page %p", __func__, pq, m)); KASSERT(old->queue != PQ_NONE && new.queue != old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); /* * Once the queue index of the page changes there is nothing * synchronizing with further updates to the page's physical * queue state. Therefore we must speculatively remove the page * from the queue now and be prepared to roll back if the queue * state update fails. If the page is not physically enqueued then * we just update its queue index. */ if ((old->flags & PGA_ENQUEUED) != 0) { new.flags &= ~PGA_ENQUEUED; next = TAILQ_NEXT(m, plinks.q); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); if (!vm_page_pqstate_fcmpset(m, old, new)) { if (next == NULL) TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); else TAILQ_INSERT_BEFORE(next, m, plinks.q); vm_pagequeue_cnt_inc(pq); return (false); } else { return (true); } } else { return (vm_page_pqstate_fcmpset(m, old, new)); } } static bool vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_pagequeue *pq; vm_page_astate_t as; bool ret; pq = _vm_page_pagequeue(m, old->queue); /* * The queue field and PGA_ENQUEUED flag are stable only so long as the * corresponding page queue lock is held. */ vm_pagequeue_lock(pq); as = vm_page_astate_load(m); if (__predict_false(as._bits != old->_bits)) { *old = as; ret = false; } else { ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new); } vm_pagequeue_unlock(pq); return (ret); } /* * Commit a queue state update that enqueues or requeues a page. */ static bool _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_domain *vmd; vm_pagequeue_assert_locked(pq); KASSERT(old->queue != PQ_NONE && new.queue == old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); new.flags |= PGA_ENQUEUED; if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if ((old->flags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); else vm_pagequeue_cnt_inc(pq); /* * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if * both flags are set in close succession, only PGA_REQUEUE_HEAD will be * applied, even if it was set first. */ if ((old->flags & PGA_REQUEUE_HEAD) != 0) { vmd = vm_pagequeue_domain(m); KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE], ("%s: invalid page queue for page %p", __func__, m)); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); } else { TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } return (true); } /* * Commit a queue state update that encodes a request for a deferred queue * operation. */ static bool vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { KASSERT(old->queue == new.queue || new.queue != PQ_NONE, ("%s: invalid state, queue %d flags %x", __func__, new.queue, new.flags)); if (old->_bits != new._bits && !vm_page_pqstate_fcmpset(m, old, new)) return (false); vm_page_pqbatch_submit(m, new.queue); return (true); } /* * A generic queue state update function. This handles more cases than the * specialized functions above. 
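 *
 * Illustrative sketch (not part of the original code) of the usual
 * caller-side pattern: load the page's atomic state, derive the desired
 * new state, and retry the commit until it takes effect:
 *
 *	vm_page_astate_t old, new;
 *
 *	old = vm_page_astate_load(m);
 *	do {
 *		new = old;
 *		new.flags &= ~PGA_QUEUE_OP_MASK;
 *		new.queue = PQ_NONE;
 *	} while (!vm_page_pqstate_commit(m, &old, new));
 *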
*/ bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { if (old->_bits == new._bits) return (true); if (old->queue != PQ_NONE && new.queue != old->queue) { if (!vm_page_pqstate_commit_dequeue(m, old, new)) return (false); if (new.queue != PQ_NONE) vm_page_pqbatch_submit(m, new.queue); } else { if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if (new.queue != PQ_NONE && ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0) vm_page_pqbatch_submit(m, new.queue); } return (true); } /* * Apply deferred queue state updates to a page. */ static inline void vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) { vm_page_astate_t new, old; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); KASSERT(queue < PQ_COUNT, ("%s: invalid queue index %d", __func__, queue)); KASSERT(pq == _vm_page_pagequeue(m, queue), ("%s: page %p does not belong to queue %p", __func__, m, pq)); for (old = vm_page_astate_load(m);;) { if (__predict_false(old.queue != queue || (old.flags & PGA_QUEUE_OP_MASK) == 0)) { counter_u64_add(queue_nops, 1); break; } KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is unmanaged", __func__, m)); new = old; if ((old.flags & PGA_DEQUEUE) != 0) { new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; if (__predict_true(_vm_page_pqstate_commit_dequeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } else { new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); if (__predict_true(_vm_page_pqstate_commit_requeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } } } static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { int i; for (i = 0; i < bq->bq_cnt; i++) vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); vm_batchqueue_init(bq); } /* * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. * The caller must have encoded the requested operation in the page * structure's a.flags field. */ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) { struct vm_batchqueue *bq; struct vm_pagequeue *pq; int domain; KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_page_domain(m); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); if (vm_batchqueue_insert(bq, m)) { critical_exit(); return; } critical_exit(); pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); vm_pqbatch_process(pq, bq, queue); vm_pqbatch_process_page(pq, m, queue); vm_pagequeue_unlock(pq); critical_exit(); } /* * vm_page_pqbatch_drain: [ internal use only ] * * Force all per-CPU page queue batch queues to be drained. This is * intended for use in severe memory shortages, to ensure that pages * do not remain stuck in the batch queues. */ void vm_page_pqbatch_drain(void) { struct thread *td; struct vm_domain *vmd; struct vm_pagequeue *pq; int cpu, domain, queue; td = curthread; CPU_FOREACH(cpu) { thread_lock(td); sched_bind(td, cpu); thread_unlock(td); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (queue = 0; queue < PQ_COUNT; queue++) { pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); vm_pqbatch_process(pq, DPCPU_PTR(pqbatch[domain][queue]), queue); critical_exit(); vm_pagequeue_unlock(pq); } } } thread_lock(td); sched_unbind(td); thread_unlock(td); } /* * vm_page_dequeue_deferred: [ internal use only ] * * Request removal of the given page from its current page * queue. 
Physical removal from the queue may be deferred * indefinitely. */ void vm_page_dequeue_deferred(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags |= PGA_DEQUEUE; } while (!vm_page_pqstate_commit_request(m, &old, new)); } /* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any, before * returning. */ void vm_page_dequeue(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; } while (!vm_page_pqstate_commit_dequeue(m, &old, new)); } /* * Schedule the given page for insertion into the specified page queue. * Physical insertion of the page may be deferred indefinitely. */ static void vm_page_enqueue(vm_page_t m, uint8_t queue) { KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); m->a.queue = queue; if ((m->a.flags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } /* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object, if it exists, must be locked, and then the page must * be xbusy. Otherwise the page must be not busied. A managed * page must be unmapped. */ static bool vm_page_free_prep(vm_page_t m) { /* * Synchronize with threads that have dropped a reference to this * page. */ atomic_thread_fence_acq(); #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, ("vm_page_free_prep: mapping flags set in page %p", m)); } else { KASSERT(m->a.queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); } VM_CNT_INC(v_tfree); if (m->object != NULL) { KASSERT(((m->oflags & VPO_UNMANAGED) != 0) == ((m->object->flags & OBJ_UNMANAGED) != 0), ("vm_page_free_prep: managed flag mismatch for page %p", m)); vm_page_assert_xbusied(m); /* * The object reference can be released without an atomic * operation. */ KASSERT((m->flags & PG_FICTITIOUS) != 0 || m->ref_count == VPRC_OBJREF, ("vm_page_free_prep: page %p has unexpected ref_count %u", m, m->ref_count)); vm_page_object_remove(m); m->ref_count -= VPRC_OBJREF; } else vm_page_assert_unbusied(m); vm_page_busy_free(m); /* * If fictitious remove object association and * return. 
*/ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); KASSERT(m->a.queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } /* * Pages need not be dequeued before they are returned to the physical * memory allocator, but they must at least be marked for a deferred * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_dequeue_deferred(m); m->valid = 0; vm_page_undirty(m); if (m->ref_count != 0) panic("vm_page_free_prep: page %p has references", m); /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); #if VM_NRESERVLEVEL > 0 /* * Determine whether the page belongs to a reservation. If the page was * allocated from a per-CPU cache, it cannot belong to a reservation, so * as an optimization, we avoid the check in that case. */ if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) return (false); #endif return (true); } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be exclusively busied if it * belongs to an object. */ static void vm_page_free_toq(vm_page_t m) { struct vm_domain *vmd; uma_zone_t zone; if (!vm_page_free_prep(m)) return; vmd = vm_pagequeue_domain(m); zone = vmd->vmd_pgcache[m->pool].zone; if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { uma_zfree(zone, m); return; } vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); } /* * vm_page_free_pages_toq: * * Returns a list of pages to the free list, disassociating it * from any VM object. In other words, this is equivalent to * calling vm_page_free_toq() for each page of a list of VM objects. */ void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; int count; if (SLIST_EMPTY(free)) return; count = 0; while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); vm_page_free_toq(m); } if (update_wire_count) vm_wire_sub(count); } /* * Mark this page as wired down. For managed pages, this prevents reclamation * by the page daemon, or when the containing object, if any, is destroyed. */ void vm_page_wire(vm_page_t m) { u_int old; #ifdef INVARIANTS if (m->object != NULL && !vm_page_busied(m) && !vm_object_busied(m->object)) VM_OBJECT_ASSERT_LOCKED(m->object); #endif KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(m->ref_count) >= 1, ("vm_page_wire: fictitious page %p has zero wirings", m)); old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } } /* * Attempt to wire a mapped page following a pmap lookup of that page. * This may fail if a thread is concurrently tearing down mappings of the page. * The transient failure is acceptable because it translates to the * failure of the caller pmap_extract_and_hold(), which should be then * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages(). 
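 *
 * Illustrative sketch (not part of the original code) of the calling
 * pattern described above, where "m" was found via a pmap lookup:
 *
 *	if (m != NULL && !vm_page_wire_mapped(m))
 *		m = NULL;	(caller falls back to vm_fault())
 *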
*/ bool vm_page_wire_mapped(vm_page_t m) { u_int old; old = m->ref_count; do { KASSERT(old > 0, ("vm_page_wire_mapped: wiring unreferenced page %p", m)); if ((old & VPRC_BLOCKED) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } return (true); } /* * Release a wiring reference to a managed page. If the page still belongs to * an object, update its position in the page queues to reflect the reference. * If the wiring was the last reference to the page, free the page. */ static void vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) { u_int old; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is unmanaged", __func__, m)); /* * Update LRU state before releasing the wiring reference. * Use a release store when updating the reference count to * synchronize with vm_page_free_prep(). */ old = m->ref_count; do { KASSERT(VPRC_WIRE_COUNT(old) > 0, ("vm_page_unwire: wire count underflow for page %p", m)); if (old > VPRC_OBJREF + 1) { /* * The page has at least one other wiring reference. An * earlier iteration of this loop may have called * vm_page_release_toq() and cleared PGA_DEQUEUE, so * re-set it if necessary. */ if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); } else if (old == VPRC_OBJREF + 1) { /* * This is the last wiring. Clear PGA_DEQUEUE and * update the page's queue state to reflect the * reference. If the page does not belong to an object * (i.e., the VPRC_OBJREF bit is clear), we only need to * clear leftover queue state. */ vm_page_release_toq(m, nqueue, noreuse); } else if (old == 1) { vm_page_aflag_clear(m, PGA_DEQUEUE); } } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); if (VPRC_WIRE_COUNT(old) == 1) { vm_wire_sub(1); if (old == 1) vm_page_free(m); } } /* * Release one wiring of the specified page, potentially allowing it to be * paged out. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue. If the released wiring * represented the last reference to the page, the page is freed. */ void vm_page_unwire(vm_page_t m, uint8_t nqueue) { KASSERT(nqueue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", nqueue, m)); if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); return; } vm_page_unwire_managed(m, nqueue, false); } /* * Unwire a page without (re-)inserting it into a page queue. It is up * to the caller to enqueue, requeue, or free the page as appropriate. * In most cases involving managed pages, vm_page_unwire() should be used * instead. */ bool vm_page_unwire_noq(vm_page_t m) { u_int old; old = vm_page_drop(m, 1); KASSERT(VPRC_WIRE_COUNT(old) != 0, ("%s: counter underflow for page %p", __func__, m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, ("%s: missing ref on fictitious page %p", __func__, m)); if (VPRC_WIRE_COUNT(old) > 1) return (false); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_clear(m, PGA_DEQUEUE); vm_wire_sub(1); return (true); } /* * Ensure that the page ends up in the specified page queue. If the page is * active or being moved to the active queue, ensure that its act_count is * at least ACT_INIT but do not otherwise mess with it. 
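 *
 * Illustrative sketch (not part of the original code): consumers normally
 * reach this helper through the wrappers defined below, e.g.
 *
 *	vm_page_activate(m);		(PQ_ACTIVE, PGA_REQUEUE)
 *	vm_page_deactivate_noreuse(m);	(PQ_INACTIVE, PGA_REQUEUE_HEAD)
 *
 * Wired and unmanaged pages are left alone, as the early return below
 * shows.
 *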
*/ static __always_inline void vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) { vm_page_astate_t old, new; KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, ("%s: invalid flags %x", __func__, nflag)); if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; old = vm_page_astate_load(m); do { if ((old.flags & PGA_DEQUEUE) != 0) break; new = old; new.flags &= ~PGA_QUEUE_OP_MASK; if (nqueue == PQ_ACTIVE) new.act_count = max(old.act_count, ACT_INIT); if (old.queue == nqueue) { if (nqueue != PQ_ACTIVE) new.flags |= nflag; } else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Put the specified page on the active list (if appropriate). */ void vm_page_activate(vm_page_t m) { vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); } /* * Move the specified page to the tail of the inactive queue, or requeue * the page if it is already in the inactive queue. */ void vm_page_deactivate(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); } void vm_page_deactivate_noreuse(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); } /* * Put a page in the laundry, or requeue it if it is already there. */ void vm_page_launder(vm_page_t m) { vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); } /* * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); vm_page_dequeue(m); vm_page_enqueue(m, PQ_UNSWAPPABLE); } /* * Release a page back to the page queues in preparation for unwiring. */ static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse) { vm_page_astate_t old, new; uint16_t nflag; /* * Use a check of the valid bits to determine whether we should * accelerate reclamation of the page. The object lock might not be * held here, in which case the check is racy. At worst we will either * accelerate reclamation of a valid page and violate LRU, or * unnecessarily defer reclamation of an invalid page. * * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ if (noreuse || m->valid == 0) { nqueue = PQ_INACTIVE; nflag = PGA_REQUEUE_HEAD; } else { nflag = PGA_REQUEUE; } old = vm_page_astate_load(m); do { new = old; /* * If the page is already in the active queue and we are not * trying to accelerate reclamation, simply mark it as * referenced and avoid any queue operations. */ new.flags &= ~PGA_QUEUE_OP_MASK; if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) new.flags |= PGA_REFERENCED; else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Unwire a page and either attempt to free it or re-add it to the page queues. */ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); if ((flags & VPR_TRYFREE) != 0) { for (;;) { object = atomic_load_ptr(&m->object); if (object == NULL) break; /* Depends on type-stability. */ if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) break; if (object == m->object) { vm_page_release_locked(m, flags); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_WUNLOCK(object); } } vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); } /* See vm_page_release(). 
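 *
 * Illustrative sketch (not part of the original code): a caller done with
 * a wired page chooses between the two entry points according to whether
 * it already holds the object's write lock:
 *
 *	vm_page_release(m, VPR_TRYFREE);	(object unlocked)
 *	vm_page_release_locked(m, VPR_TRYFREE);	(object write-locked)
 *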
*/ void vm_page_release_locked(vm_page_t m, int flags) { VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release_locked: page %p is unmanaged", m)); if (vm_page_unwire_noq(m)) { if ((flags & VPR_TRYFREE) != 0 && (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && vm_page_tryxbusy(m)) { /* * An unlocked lookup may have wired the page before the * busy lock was acquired, in which case the page must * not be freed. */ if (__predict_true(!vm_page_wired(m))) { vm_page_free(m); return; } vm_page_xunbusy(m); } else { vm_page_release_toq(m, PQ_INACTIVE, flags != 0); } } } static bool vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) { u_int old; KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, ("vm_page_try_blocked_op: page %p has no object", m)); KASSERT(vm_page_busied(m), ("vm_page_try_blocked_op: page %p is not busy", m)); VM_OBJECT_ASSERT_LOCKED(m->object); old = m->ref_count; do { KASSERT(old != 0, ("vm_page_try_blocked_op: page %p has no references", m)); if (VPRC_WIRE_COUNT(old) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); (op)(m); /* * If the object is read-locked, new wirings may be created via an * object lookup. */ old = vm_page_drop(m, VPRC_BLOCKED); KASSERT(!VM_OBJECT_WOWNED(m->object) || old == (VPRC_BLOCKED | VPRC_OBJREF), ("vm_page_try_blocked_op: unexpected refcount value %u for %p", old, m)); return (true); } /* * Atomically check for wirings and remove all mappings of the page. */ bool vm_page_try_remove_all(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_all)); } /* * Atomically check for wirings and remove all writeable mappings of the page. */ bool vm_page_try_remove_write(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_write)); } /* * vm_page_advise * * Apply the specified advice to the given page. */ void vm_page_advise(vm_page_t m, int advice) { VM_OBJECT_ASSERT_WLOCKED(m->object); vm_page_assert_xbusied(m); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * vm_page_grab_release * * Helper routine for grab functions to release busy on return. */ static inline void vm_page_grab_release(vm_page_t m, int allocflags) { if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) vm_page_sunbusy(m); else vm_page_xunbusy(m); } } /* * vm_page_grab_sleep * * Sleep for busy according to VM_ALLOC_ parameters. Returns true * if the caller should retry and false otherwise. 
* * If the object is locked on entry the object will be unlocked with * false returns and still locked but possibly having been dropped * with true returns. */ static bool vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); /* * Reference the page before unlocking and sleeping so that * the page daemon is less likely to reclaim it. */ if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_reference(m); if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) && locked) VM_OBJECT_WLOCK(object); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); return (true); } /* * Assert that the grab flags are valid. */ static inline void vm_page_grab_check(int allocflags) { KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab*: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); } /* * Calculate the page allocation flags for grab. */ static inline int vm_page_grab_pflags(int allocflags) { int pflags; pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) pflags |= VM_ALLOC_SBUSY; return (pflags); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_grab_check(allocflags); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true)) goto retrylookup; return (NULL); } goto out; } if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags)); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); out: vm_page_grab_release(m, allocflags); return (m); } /* * Locklessly attempt to acquire a page given a (object, pindex) tuple * and an optional previous page to avoid the radix lookup. The resulting * page will be validated against the identity tuple and busied or wired * as requested. A NULL *mp return guarantees that the page was not in * radix at the time of the call but callers must perform higher level * synchronization or retry the operation under a lock if they require * an atomic answer. This is the only lock free validation routine, * other routines can depend on the resulting page state. * * The return value indicates whether the operation failed due to caller * flags. The return is tri-state with mp: * * (true, *mp != NULL) - The operation was successful. * (true, *mp == NULL) - The page was not found in tree. * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition. 
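 *
 * A minimal caller sketch of this contract (hypothetical caller; the
 * in-tree consumers are the vm_page_grab*_unlocked() functions below).
 * A false return means NOWAIT or WAITFAIL refused the page; a true return
 * with *mp still NULL means the page was not resident at the instant of
 * the call:
 *
 *	if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags))
 *		return (NULL);
 *	if (m == NULL)
 *		m = locked_fallback(object, pindex, allocflags);
 *
 * where locked_fallback() stands in for a locked-path helper such as
 * vm_page_grab().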
*/ static bool vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t prev, vm_page_t *mp, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev)); *mp = NULL; for (;;) { /* * We may see a false NULL here because the previous page * has been removed or just inserted and the list is loaded * without barriers. Switch to radix to verify. */ if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL || QMD_IS_TRASHED(m) || m->pindex != pindex || atomic_load_ptr(&m->object) != object) { prev = NULL; /* * This guarantees the result is instantaneously * correct. */ m = vm_radix_lookup_unlocked(&object->rtree, pindex); } if (m == NULL) return (true); if (vm_page_trybusy(m, allocflags)) { if (m->object == object && m->pindex == pindex) break; /* relookup. */ vm_page_busy_release(m); cpu_spinwait(); continue; } if (!vm_page_grab_sleep(object, m, pindex, "pgnslp", allocflags, false)) return (false); } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); *mp = m; return (true); } /* * Try to locklessly grab a page and fall back to the object lock if NOCREAT * is not set. */ vm_page_t vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags)) return (NULL); if (m != NULL) return (m); /* * The radix lockless lookup should never return a false negative * errors. If the user specifies NOCREAT they are guaranteed there * was no page present at the instant of the call. A NOCREAT caller * must handle create races gracefully. */ if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); VM_OBJECT_WLOCK(object); m = vm_page_grab(object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (m); } /* * Grab a page and make it valid, paging in if necessary. Pages missing from * their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought * in simultaneously. Additional pages will be left on a paging queue but * will neither be wired nor busy regardless of allocflags. */ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_t ma[VM_INITIAL_PAGEIN]; int after, i, pflags, rv; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); VM_OBJECT_ASSERT_WLOCKED(object); pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY); pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { /* * If the page is fully valid it can only become invalid * with the object lock held. If it is not valid it can * become valid with the busy lock held. Therefore, we * may unnecessarily lock the exclusive busy here if we * race with I/O completion not using the object lock. * However, we will not end up with an invalid page and a * shared lock. */ if (!vm_page_trybusy(m, vm_page_all_valid(m) ? 
allocflags : 0)) { (void)vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true); goto retrylookup; } if (vm_page_all_valid(m)) goto out; if ((allocflags & VM_ALLOC_NOCREAT) != 0) { vm_page_busy_release(m); *mp = NULL; return (VM_PAGER_FAIL); } } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) { goto retrylookup; } vm_page_assert_xbusied(m); if (vm_pager_has_page(object, pindex, NULL, &after)) { after = MIN(after, VM_INITIAL_PAGEIN); after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT); after = MAX(after, 1); ma[0] = m; for (i = 1; i < after; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid || !vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, m->pindex + i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } after = i; vm_object_pip_add(object, after); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, ma, after, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeupn(object, after); /* Pager may have replaced a page. */ m = ma[0]; if (rv != VM_PAGER_OK) { for (i = 0; i < after; i++) { if (!vm_page_wired(ma[i])) vm_page_free(ma[i]); else vm_page_xunbusy(ma[i]); } *mp = NULL; return (rv); } for (i = 1; i < after; i++) vm_page_readahead_finish(ma[i]); MPASS(vm_page_all_valid(m)); } else { vm_page_zero_invalid(m, TRUE); } out: if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m)) vm_page_busy_downgrade(m); else if ((allocflags & VM_ALLOC_NOBUSY) != 0) vm_page_busy_release(m); *mp = m; return (VM_PAGER_OK); } /* * Locklessly grab a valid page. If the page is not valid or not yet * allocated this will fall back to the object lock method. */ int vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int flags; int error; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY " "mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags)); /* * Attempt a lockless lookup and busy. We need at least an sbusy * before we can inspect the valid field and return a wired page. */ flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, flags)) return (VM_PAGER_FAIL); if ((m = *mp) != NULL) { if (vm_page_all_valid(m)) { if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); return (VM_PAGER_OK); } vm_page_busy_release(m); } if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } VM_OBJECT_WLOCK(object); error = vm_page_grab_valid(mp, object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (error); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. 
* * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT(count > 0, ("vm_page_grab_pages: invalid page count %d", count)); vm_page_grab_check(allocflags); pflags = vm_page_grab_pflags(allocflags); i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex + i, "grbmaw", allocflags, true)) goto retrylookup; break; } } else { if ((allocflags & VM_ALLOC_NOCREAT) != 0) break; m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) break; goto retrylookup; } } if (vm_page_none_valid(m) && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } vm_page_grab_release(m, allocflags); ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Unlocked variant of vm_page_grab_pages(). This accepts the same flags * and will fall back to the locked variant to handle allocation. */ int vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, pred; int flags; int i; KASSERT(count > 0, ("vm_page_grab_pages_unlocked: invalid page count %d", count)); vm_page_grab_check(allocflags); /* * Modify flags for lockless acquire to hold the page until we * set it valid if necessary. */ flags = allocflags & ~VM_ALLOC_NOBUSY; pred = NULL; for (i = 0; i < count; i++, pindex++) { if (!vm_page_acquire_unlocked(object, pindex, pred, &m, flags)) return (i); if (m == NULL) break; if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } /* m will still be wired or busy according to flags. */ vm_page_grab_release(m, allocflags); pred = ma[i] = m; } if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0) return (i); count -= i; VM_OBJECT_WLOCK(object); i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count); VM_OBJECT_WUNLOCK(object); return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. 
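 *
 * As a worked example, assuming the usual DEV_BSIZE of 512 (DEV_BSHIFT 9):
 *
 *	vm_page_bits(0, 1024)  gives first_bit 0, last_bit 1, mask 0x3
 *	vm_page_bits(512, 512) gives first_bit 1, last_bit 1, mask 0x2
 *
 * i.e. one bit per DEV_BSIZE block touched by [base, base + size).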
*/ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } void vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) { #if PAGE_SIZE == 32768 atomic_set_64((uint64_t *)bits, set); #elif PAGE_SIZE == 16384 atomic_set_32((uint32_t *)bits, set); #elif (PAGE_SIZE == 8192) && defined(atomic_set_16) atomic_set_16((uint16_t *)bits, set); #elif (PAGE_SIZE == 4096) && defined(atomic_set_8) atomic_set_8((uint8_t *)bits, set); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_set_32((uint32_t *)addr, set << shift); #endif /* PAGE_SIZE */ } static inline void vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) { #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)bits, clear); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)bits, clear); #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) atomic_clear_16((uint16_t *)bits, clear); #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) atomic_clear_8((uint8_t *)bits, clear); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, clear << shift); #endif /* PAGE_SIZE */ } static inline vm_page_bits_t vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits) { #if PAGE_SIZE == 32768 uint64_t old; old = *bits; while (atomic_fcmpset_64(bits, &old, newbits) == 0); return (old); #elif PAGE_SIZE == 16384 uint32_t old; old = *bits; while (atomic_fcmpset_32(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16) uint16_t old; old = *bits; while (atomic_fcmpset_16(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8) uint8_t old; old = *bits; while (atomic_fcmpset_8(bits, &old, newbits) == 0); return (old); #else /* PAGE_SIZE <= 4096*/ uintptr_t addr; uint32_t old, new, mask; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, swap, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); mask = VM_PAGE_BITS_ALL << shift; old = *bits; do { new = old & ~mask; new |= newbits << shift; } while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0); return (old >> shift); #endif /* PAGE_SIZE */ } /* * vm_page_set_valid_range: * * Sets portions of a page valid. 
The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; vm_page_bits_t pagebits; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); } /* * Set the page dirty bits and free the invalid swap space if * present. Returns the previous dirty bits. */ vm_page_bits_t vm_page_set_dirty(vm_page_t m) { vm_page_bits_t old; VM_PAGE_OBJECT_BUSY_ASSERT(m); if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { old = m->dirty; m->dirty = VM_PAGE_BITS_ALL; } else old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) vm_pager_page_unswapped(m); return (old); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { vm_page_assert_busied(m); /* * If the page is xbusied and not write mapped we are the * only thread that can modify dirty bits. Otherwise, The pmap * layer can call vm_page_dirty() without holding a distinguished * lock. The combination of page busy and atomic operations * suffice to guarantee consistency of the page dirty field. */ if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else vm_page_bits_clear(m, &m->dirty, pagebits); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. 
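 *
 * For example, with DEV_BSIZE 512, base 0 and size 1100 give endoff 1100,
 * which is not block aligned; if block 2 (starting at offset 1024) is
 * currently invalid, bytes [1100, 1536) are zeroed so that the block can
 * be marked valid below without exposing stale data.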
*/ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PGA_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; vm_page_aflag_clear(m, PGA_NOSYNC); } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; /* * The object lock is required so that pages can't be mapped * read-only while we're in the process of invalidating them. */ object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_busied(m); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && vm_page_all_valid(m)) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); if (vm_page_xbusied(m)) { m->valid &= ~bits; m->dirty &= ~bits; } else { vm_page_bits_clear(m, &m->valid, bits); vm_page_bits_clear(m, &m->dirty, bits); } } /* * vm_page_invalid: * * Invalidates the entire page. The page must be busy, unmapped, and * the enclosing object must be locked. The object locks protects * against concurrent read-only pmap enter which is done without * busy. */ void vm_page_invalid(vm_page_t m) { vm_page_assert_busied(m); VM_OBJECT_ASSERT_WLOCKED(m->object); MPASS(!pmap_page_is_mapped(m)); if (vm_page_xbusied(m)) m->valid = 0; else vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. 
* * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistency * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) vm_page_valid(m); } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * Some callers envoke this routine without the busy lock held and * handle races via higher level locks. Typical callers should * hold a busy lock to prevent invalidation. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. 
*/ void vm_page_test_dirty(vm_page_t m) { vm_page_assert_busied(m); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_valid(vm_page_t m) { vm_page_assert_busied(m); if (vm_page_xbusied(m)) m->valid = VM_PAGE_BITS_ALL; else vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of a page or object busy. */ if (m->object != NULL && !vm_page_busied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include -DB_SHOW_COMMAND(page, vm_page_print_page_info) +DB_SHOW_COMMAND_FLAGS(page, vm_page_print_page_info, DB_CMD_MEMSAFE) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } -DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) +DB_SHOW_COMMAND_FLAGS(pageq, vm_page_print_pageq_info, DB_CMD_MEMSAFE) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->a.queue, m->ref_count, m->a.flags, m->oflags, 
m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c index 45c624fdd451..856cd3ec83b8 100644 --- a/sys/vm/vm_phys.c +++ b/sys/vm/vm_phys.c @@ -1,1894 +1,1894 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Physical memory system implementation * * Any external functions defined by this module are only to be used by the * virtual memory system. 
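 *
 * Free pages are kept on buddy queues indexed by NUMA domain, free list,
 * pool and order; roughly (see vm_phys_free_queues[] below):
 *
 *	fl = &vm_phys_free_queues[domain][flind][pool][0];
 *	m = TAILQ_FIRST(&fl[order].pl);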
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); #ifdef NUMA struct mem_affinity __read_mostly *mem_affinity; int __read_mostly *mem_locality; #endif int __read_mostly vm_ndomains = 1; domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1); struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX]; int __read_mostly vm_phys_nsegs; static struct vm_phys_seg vm_phys_early_segs[8]; static int vm_phys_early_nsegs; struct vm_phys_fictitious_seg; static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, struct vm_phys_fictitious_seg *); RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = RB_INITIALIZER(&vm_phys_fictitious_tree); struct vm_phys_fictitious_seg { RB_ENTRY(vm_phys_fictitious_seg) node; /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; }; RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, vm_phys_fictitious_cmp); static struct rwlock_padalign vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist __aligned(CACHE_LINE_SIZE) vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL] [VM_NFREEORDER_MAX]; static int __read_mostly vm_nfreelists; /* * These "avail lists" are globals used to communicate boot-time physical * memory layout to other parts of the kernel. Each physically contiguous * region of memory is defined by a start address at an even index and an * end address at the following odd index. Each list is terminated by a * pair of zero entries. * * dump_avail tells the dump code what regions to include in a crash dump, and * phys_avail is all of the remaining physical memory that is available for * the vm system. * * Initially dump_avail and phys_avail are identical. Boot time memory * allocations remove extents from phys_avail that may still be included * in dumps. */ vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; /* * Provides the mapping from VM_FREELIST_* to free list indices (flind). */ static int __read_mostly vm_freelist_to_flind[VM_NFREELIST]; CTASSERT(VM_FREELIST_DEFAULT == 0); #ifdef VM_FREELIST_DMA32 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) #endif /* * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about * the ordering of the free list boundaries. 
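 *
 * On amd64, for instance, this means the 16MB ISA DMA limit
 * (VM_LOWMEM_BOUNDARY) must lie below the 4GB limit (VM_DMA32_BOUNDARY):
 *
 *	VM_LOWMEM_BOUNDARY (16MB) < VM_DMA32_BOUNDARY (4GB)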
*/ #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); #endif static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info"); static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); #ifdef NUMA static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); #endif SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, int tail); /* * Red-black tree helpers for vm fictitious range management. */ static inline int vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, struct vm_phys_fictitious_seg *range) { KASSERT(range->start != 0 && range->end != 0, ("Invalid range passed on search for vm_fictitious page")); if (p->start >= range->end) return (1); if (p->start < range->start) return (-1); return (0); } static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, struct vm_phys_fictitious_seg *p2) { /* Check if this is a search for a page */ if (p1->end == 0) return (vm_phys_fictitious_in_range(p1, p2)); KASSERT(p2->end != 0, ("Invalid range passed as second parameter to vm fictitious comparison")); /* Searching to add a new range */ if (p1->end <= p2->start) return (-1); if (p1->start >= p2->end) return (1); panic("Trying to add overlapping vm fictitious ranges:\n" "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); } int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high) { #ifdef NUMA domainset_t mask; int i; if (vm_ndomains == 1 || mem_affinity == NULL) return (0); DOMAINSET_ZERO(&mask); /* * Check for any memory that overlaps low, high. */ for (i = 0; mem_affinity[i].end != 0; i++) if (mem_affinity[i].start <= high && mem_affinity[i].end >= low) DOMAINSET_SET(mem_affinity[i].domain, &mask); if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask)) return (prefer); if (DOMAINSET_EMPTY(&mask)) panic("vm_phys_domain_match: Impossible constraint"); return (DOMAINSET_FFS(&mask) - 1); #else return (0); #endif } /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. 
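 *
 * This handler backs the vm.phys_free sysctl, so the table can be
 * inspected from userland with, for example:
 *
 *	sysctl vm.phys_free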
*/ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_freelist *fl; int dom, error, flind, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); for (dom = 0; dom < vm_ndomains; dom++) { sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { sbuf_printf(&sbuf, "\nFREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, " | POOL %d", pind); sbuf_printf(&sbuf, "\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, "-- -- "); sbuf_printf(&sbuf, "--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); } sbuf_printf(&sbuf, "\n"); } } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Outputs the set of physical memory segments. */ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_phys_seg *seg; int error, segind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); for (segind = 0; segind < vm_phys_nsegs; segind++) { sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); seg = &vm_phys_segs[segind]; sbuf_printf(&sbuf, "start: %#jx\n", (uintmax_t)seg->start); sbuf_printf(&sbuf, "end: %#jx\n", (uintmax_t)seg->end); sbuf_printf(&sbuf, "domain: %d\n", seg->domain); sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Return affinity, or -1 if there's no affinity information. */ int vm_phys_mem_affinity(int f, int t) { #ifdef NUMA if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); #else return (-1); #endif } #ifdef NUMA /* * Outputs the VM locality table. */ static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; int error, i, j; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n"); for (i = 0; i < vm_ndomains; i++) { sbuf_printf(&sbuf, "%d: ", i); for (j = 0; j < vm_ndomains; j++) { sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); } sbuf_printf(&sbuf, "\n"); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } #endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { m->order = order; if (tail) TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); else TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); fl[order].lcnt++; } static void vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { TAILQ_REMOVE(&fl[order].pl, m, listq); fl[order].lcnt--; m->order = VM_NFREEORDER; } /* * Create a physical memory segment. 
*/ static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) { struct vm_phys_seg *seg; KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); KASSERT(domain >= 0 && domain < vm_ndomains, ("vm_phys_create_seg: invalid domain provided")); seg = &vm_phys_segs[vm_phys_nsegs++]; while (seg > vm_phys_segs && (seg - 1)->start >= end) { *seg = *(seg - 1); seg--; } seg->start = start; seg->end = end; seg->domain = domain; } static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { #ifdef NUMA int i; if (mem_affinity == NULL) { _vm_phys_create_seg(start, end, 0); return; } for (i = 0;; i++) { if (mem_affinity[i].end == 0) panic("Reached end of affinity info"); if (mem_affinity[i].end <= start) continue; if (mem_affinity[i].start > start) panic("No affinity info for start %jx", (uintmax_t)start); if (mem_affinity[i].end >= end) { _vm_phys_create_seg(start, end, mem_affinity[i].domain); break; } _vm_phys_create_seg(start, mem_affinity[i].end, mem_affinity[i].domain); start = mem_affinity[i].end; } #else _vm_phys_create_seg(start, end, 0); #endif } /* * Add a physical memory segment. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t paddr; KASSERT((start & PAGE_MASK) == 0, ("vm_phys_define_seg: start is not page aligned")); KASSERT((end & PAGE_MASK) == 0, ("vm_phys_define_seg: end is not page aligned")); /* * Split the physical memory segment if it spans two or more free * list boundaries. */ paddr = start; #ifdef VM_FREELIST_LOWMEM if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); paddr = VM_LOWMEM_BOUNDARY; } #endif #ifdef VM_FREELIST_DMA32 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); paddr = VM_DMA32_BOUNDARY; } #endif vm_phys_create_seg(paddr, end); } /* * Initialize the physical memory allocator. * * Requires that vm_page_array is initialized! */ void vm_phys_init(void) { struct vm_freelist *fl; struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg; #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE) u_long npages; #endif int dom, flind, freelist, oind, pind, segind; /* * Compute the number of free lists, and generate the mapping from the * manifest constants VM_FREELIST_* to the free list indices. * * Initially, the entries of vm_freelist_to_flind[] are set to either * 0 or 1 to indicate which free lists should be created. */ #ifdef VM_DMA32_NPAGES_THRESHOLD npages = 0; #endif for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; else #endif #ifdef VM_FREELIST_DMA32 if ( #ifdef VM_DMA32_NPAGES_THRESHOLD /* * Create the DMA32 free list only if the amount of * physical memory above physical address 4G exceeds the * given threshold. */ npages > VM_DMA32_NPAGES_THRESHOLD && #endif seg->end <= VM_DMA32_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; else #endif { #ifdef VM_DMA32_NPAGES_THRESHOLD npages += atop(seg->end - seg->start); #endif vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; } } /* Change each entry into a running total of the free lists. 
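 *
 * For example, create-flags of { 1, 0, 1 } become the running totals
 * { 1, 1, 2 }, so vm_nfreelists ends up as 2, and the decrement further
 * below turns them into the free list indices { 0, 0, 1 }.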
*/ for (freelist = 1; freelist < VM_NFREELIST; freelist++) { vm_freelist_to_flind[freelist] += vm_freelist_to_flind[freelist - 1]; } vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); /* Change each entry into a free list index. */ for (freelist = 0; freelist < VM_NFREELIST; freelist++) vm_freelist_to_flind[freelist]--; /* * Initialize the first_page and free_queues fields of each physical * memory segment. */ #ifdef VM_PHYSSEG_SPARSE npages = 0; #endif for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; #ifdef VM_PHYSSEG_SPARSE seg->first_page = &vm_page_array[npages]; npages += atop(seg->end - seg->start); #else seg->first_page = PHYS_TO_VM_PAGE(seg->start); #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; KASSERT(flind >= 0, ("vm_phys_init: LOWMEM flind < 0")); } else #endif #ifdef VM_FREELIST_DMA32 if (seg->end <= VM_DMA32_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; KASSERT(flind >= 0, ("vm_phys_init: DMA32 flind < 0")); } else #endif { flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; KASSERT(flind >= 0, ("vm_phys_init: DEFAULT flind < 0")); } seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; } /* * Coalesce physical memory segments that are contiguous and share the * same per-domain free queues. */ prev_seg = vm_phys_segs; seg = &vm_phys_segs[1]; end_seg = &vm_phys_segs[vm_phys_nsegs]; while (seg < end_seg) { if (prev_seg->end == seg->start && prev_seg->free_queues == seg->free_queues) { prev_seg->end = seg->end; KASSERT(prev_seg->domain == seg->domain, ("vm_phys_init: free queues cannot span domains")); vm_phys_nsegs--; end_seg--; for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++) *tmp_seg = *(tmp_seg + 1); } else { prev_seg = seg; seg++; } } /* * Initialize the free queues. */ for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; for (oind = 0; oind < VM_NFREEORDER; oind++) TAILQ_INIT(&fl[oind].pl); } } } rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* * Register info about the NUMA topology of the system. * * Invoked by platform-dependent code prior to vm_phys_init(). */ void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, int *locality) { #ifdef NUMA int d, i; /* * For now the only override value that we support is 1, which * effectively disables NUMA-awareness in the allocators. */ d = 0; TUNABLE_INT_FETCH("vm.numa.disabled", &d); if (d) ndomains = 1; if (ndomains > 1) { vm_ndomains = ndomains; mem_affinity = affinity; mem_locality = locality; } for (i = 0; i < vm_ndomains; i++) DOMAINSET_SET(i, &all_domains); #else (void)ndomains; (void)affinity; (void)locality; #endif } /* * Split a contiguous, power of two-sized set of physical pages. * * When this function is called by a page allocation function, the caller * should request insertion at the head unless the order [order, oind) queues * are known to be empty. The objective being to reduce the likelihood of * long-term fragmentation by promoting contemporaneous allocation and * (hopefully) deallocation. 
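 *
 * As a concrete example, splitting an order 3 block (8 pages) down to an
 * order 0 allocation frees the buddies m[4] at order 2, m[2] at order 1
 * and m[1] at order 0, leaving m itself for the caller.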
*/ static __inline void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order, int tail) { vm_page_t m_buddy; while (oind > order) { oind--; m_buddy = &m[1 << oind]; KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); vm_freelist_add(fl, m_buddy, oind, tail); } } /* * Add the physical pages [m, m + npages) at the end of a power-of-two aligned * and sized set to the specified free list. * * When this function is called by a page allocation function, the caller * should request insertion at the head unless the lower-order queues are * known to be empty. The objective being to reduce the likelihood of long- * term fragmentation by promoting contemporaneous allocation and (hopefully) * deallocation. * * The physical page m's buddy must not be free. */ static void vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail) { u_int n; int order; KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0")); KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) & ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0, ("vm_phys_enq_range: page %p and npages %u are misaligned", m, npages)); do { KASSERT(m->order == VM_NFREEORDER, ("vm_phys_enq_range: page %p has unexpected order %d", m, m->order)); order = ffs(npages) - 1; KASSERT(order < VM_NFREEORDER, ("vm_phys_enq_range: order %d is out of range", order)); vm_freelist_add(fl, m, order, tail); n = 1 << order; m += n; npages -= n; } while (npages > 0); } /* * Set the pool for a contiguous, power of two-sized set of physical pages. */ static void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) m_tmp->pool = pool; } /* * Tries to allocate the specified number of pages from the specified pool * within the specified domain. Returns the actual number of allocated pages * and a pointer to each page through the array ma[]. * * The returned pages may not be physically contiguous. However, in contrast * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0), * calling this function once to allocate the desired number of pages will * avoid wasted time in vm_phys_split_pages(). * * The free page queues for the specified domain must be locked. */ int vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]) { struct vm_freelist *alt, *fl; vm_page_t m; int avail, end, flind, freelist, i, need, oind, pind; KASSERT(domain >= 0 && domain < vm_ndomains, ("vm_phys_alloc_npages: domain %d is out of range", domain)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_npages: pool %d is out of range", pool)); KASSERT(npages <= 1 << (VM_NFREEORDER - 1), ("vm_phys_alloc_npages: npages %d is out of range", npages)); vm_domain_free_assert_locked(VM_DOMAIN(domain)); i = 0; for (freelist = 0; freelist < VM_NFREELIST; freelist++) { flind = vm_freelist_to_flind[freelist]; if (flind < 0) continue; fl = vm_phys_free_queues[domain][flind][pool]; for (oind = 0; oind < VM_NFREEORDER; oind++) { while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) { vm_freelist_rem(fl, m, oind); avail = 1 << oind; need = imin(npages - i, avail); for (end = i + need; i < end;) ma[i++] = m++; if (need < avail) { /* * Return excess pages to fl. Its * order [0, oind) queues are empty. 
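 *
 * For instance, with oind 3 the block held 8 pages; if only
 * 5 of them were needed, the 3 left over are queued below as
 * an order 0 block followed by an order 1 block.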
*/ vm_phys_enq_range(m, avail - need, fl, 1); return (npages); } else if (i == npages) return (npages); } } for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = vm_phys_free_queues[domain][flind][pind]; while ((m = TAILQ_FIRST(&alt[oind].pl)) != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); avail = 1 << oind; need = imin(npages - i, avail); for (end = i + need; i < end;) ma[i++] = m++; if (need < avail) { /* * Return excess pages to fl. * Its order [0, oind) queues * are empty. */ vm_phys_enq_range(m, avail - need, fl, 1); return (npages); } else if (i == npages) return (npages); } } } } return (i); } /* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_pages(int domain, int pool, int order) { vm_page_t m; int freelist; for (freelist = 0; freelist < VM_NFREELIST; freelist++) { m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order); if (m != NULL) return (m); } return (NULL); } /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. The free list must be specified using one of the * manifest constants VM_FREELIST_*. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order) { struct vm_freelist *alt, *fl; vm_page_t m; int oind, pind, flind; KASSERT(domain >= 0 && domain < vm_ndomains, ("vm_phys_alloc_freelist_pages: domain %d is out of range", domain)); KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", freelist)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); flind = vm_freelist_to_flind[freelist]; /* Check if freelist is present */ if (flind < 0) return (NULL); vm_domain_free_assert_locked(VM_DOMAIN(domain)); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { vm_freelist_rem(fl, m, oind); /* The order [order, oind) queues are empty. */ vm_phys_split_pages(m, oind, fl, order, 1); return (m); } } /* * The given pool was empty. Find the largest * contiguous, power-of-two-sized set of pages in any * pool. Transfer these pages to the given pool, and * use them to satisfy the allocation. */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); /* The order [order, oind) queues are empty. */ vm_phys_split_pages(m, oind, fl, order, 1); return (m); } } } return (NULL); } /* * Find the vm_page corresponding to the given physical address. 
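 *
 * NULL is returned when the address is not covered by any registered
 * segment, so callers are expected to check, e.g.:
 *
 *	m = vm_phys_paddr_to_vm_page(pa);
 *	if (m == NULL)
 *		panic("no vm_page for pa %jx", (uintmax_t)pa);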
*/ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (&seg->first_page[atop(pa - seg->start)]); } return (NULL); } vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; m = NULL; tmp.start = pa; tmp.end = 0; rw_rlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); rw_runlock(&vm_phys_fictitious_reg_lock); if (seg == NULL) return (NULL); m = &seg->first_page[atop(pa - seg->start)]; KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); return (m); } static inline void vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, long page_count, vm_memattr_t memattr) { long i; bzero(range, page_count * sizeof(*range)); for (i = 0; i < page_count; i++) { vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); range[i].oflags &= ~VPO_UNMANAGED; range[i].busy_lock = VPB_UNBUSIED; } } int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) { struct vm_phys_fictitious_seg *seg; vm_page_t fp; long page_count; #ifdef VM_PHYSSEG_DENSE long pi, pe; long dpage_count; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; if ((pe - first_page) > vm_page_array_size) { /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Use vm_page_array pages for those that are * inside of the vm_page_array range, and * allocate the remaining ones. */ dpage_count = vm_page_array_size - (pi - first_page); vm_phys_fictitious_init_range(fp, start, dpage_count, memattr); page_count -= dpage_count; start += ptoa(dpage_count); goto alloc; } /* * We can allocate the full range from vm_page_array, * so there's no need to register the range in the tree. */ vm_phys_fictitious_init_range(fp, start, page_count, memattr); return (0); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ fp = &vm_page_array[0]; dpage_count = pe - first_page; vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, memattr); end -= ptoa(dpage_count); page_count -= dpage_count; goto alloc; } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* * Trying to register a fictitious range that expands before * and after vm_page_array. 
*/ return (EINVAL); } else { alloc: #endif fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, M_WAITOK); #ifdef VM_PHYSSEG_DENSE } #endif vm_phys_fictitious_init_range(fp, start, page_count, memattr); seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); seg->start = start; seg->end = end; seg->first_page = fp; rw_wlock(&vm_phys_fictitious_reg_lock); RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE long pi, pe; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { if ((pe - first_page) <= vm_page_array_size) { /* * This segment was allocated using vm_page_array * only, there's nothing to do since those pages * were never added to the tree. */ return; } /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Calculate how many pages were added to the * tree and free them. */ start = ptoa(first_page + vm_page_array_size); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ end = ptoa(first_page); } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* Since it's not possible to register such a range, panic. */ panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } #endif tmp.start = start; tmp.end = 0; rw_wlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); if (seg->start != start || seg->end != end) { rw_wunlock(&vm_phys_fictitious_reg_lock); panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); free(seg->first_page, M_FICT_PAGES); free(seg, M_FICT_PAGES); } /* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_pages(vm_page_t m, int order) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa; vm_page_t m_buddy; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); KASSERT(m->pool < VM_NFREEPOOL, ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); seg = &vm_phys_segs[m->segind]; vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; m_buddy = &seg->first_page[atop(pa - seg->start)]; if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; vm_freelist_add(fl, m, order, 1); } /* * Return the largest possible order of a set of pages starting at m. 
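 *
 * For example, with 4KB pages a page at physical address 0x6000 is frame 6
 * (binary 110), so ffsl() returns 2 and the largest order is 1, i.e. a
 * naturally aligned 2 page block; a page at frame 8 would allow order 3
 * (8 pages).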
*/ static int max_order(vm_page_t m) { /* * Unsigned "min" is used here so that "order" is assigned * "VM_NFREEORDER - 1" when "m"'s physical address is zero * or the low-order bits of its physical address are zero * because the size of a physical address exceeds the size of * a long. */ return (min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, VM_NFREEORDER - 1)); } /* * Free a contiguous, arbitrarily sized set of physical pages, without * merging across set boundaries. * * The free page queues must be locked. */ void vm_phys_enqueue_contig(vm_page_t m, u_long npages) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_page_t m_end; int order; /* * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. */ vm_domain_free_assert_locked(vm_pagequeue_domain(m)); seg = &vm_phys_segs[m->segind]; fl = (*seg->free_queues)[m->pool]; m_end = m + npages; /* Free blocks of increasing size. */ while ((order = max_order(m)) < VM_NFREEORDER - 1 && m + (1 << order) <= m_end) { KASSERT(seg == &vm_phys_segs[m->segind], ("%s: page range [%p,%p) spans multiple segments", __func__, m_end - npages, m)); vm_freelist_add(fl, m, order, 1); m += 1 << order; } /* Free blocks of maximum size. */ while (m + (1 << order) <= m_end) { KASSERT(seg == &vm_phys_segs[m->segind], ("%s: page range [%p,%p) spans multiple segments", __func__, m_end - npages, m)); vm_freelist_add(fl, m, order, 1); m += 1 << order; } /* Free blocks of diminishing size. */ while (m < m_end) { KASSERT(seg == &vm_phys_segs[m->segind], ("%s: page range [%p,%p) spans multiple segments", __func__, m_end - npages, m)); order = flsl(m_end - m) - 1; vm_freelist_add(fl, m, order, 1); m += 1 << order; } } /* * Free a contiguous, arbitrarily sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_contig(vm_page_t m, u_long npages) { int order_start, order_end; vm_page_t m_start, m_end; vm_domain_free_assert_locked(vm_pagequeue_domain(m)); m_start = m; order_start = max_order(m_start); if (order_start < VM_NFREEORDER - 1) m_start += 1 << order_start; m_end = m + npages; order_end = max_order(m_end); if (order_end < VM_NFREEORDER - 1) m_end -= 1 << order_end; /* * Avoid unnecessary coalescing by freeing the pages at the start and * end of the range last. */ if (m_start < m_end) vm_phys_enqueue_contig(m_start, m_end - m_start); if (order_start < VM_NFREEORDER - 1) vm_phys_free_pages(m, order_start); if (order_end < VM_NFREEORDER - 1) vm_phys_free_pages(m_end, order_end); } /* * Scan physical memory between the specified addresses "low" and "high" for a * run of contiguous physical pages that satisfy the specified conditions, and * return the lowest page in the run. The specified "alignment" determines * the alignment of the lowest physical page in the run. If the specified * "boundary" is non-zero, then the run of physical pages cannot span a * physical address that is a multiple of "boundary". * * "npages" must be greater than zero. Both "alignment" and "boundary" must * be a power of two. 
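* As an illustrative call (parameter values assumed, not taken from any in-tree caller), with 4 KB pages vm_phys_scan_contig(0, 512, 0, 1ULL << 32, 2 * 1024 * 1024, 0, 0) would scan domain 0 for a 2 MB-aligned run of 512 pages below 4 GB with no boundary restriction.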
*/ vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options) { vm_paddr_t pa_end; vm_page_t m_end, m_run, m_start; struct vm_phys_seg *seg; int segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); if (low >= high) return (NULL); for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (seg->domain != domain) continue; if (seg->start >= high) break; if (low >= seg->end) continue; if (low <= seg->start) m_start = seg->first_page; else m_start = &seg->first_page[atop(low - seg->start)]; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) continue; m_end = &seg->first_page[atop(pa_end - seg->start)]; m_run = vm_page_scan_contig(npages, m_start, m_end, alignment, boundary, options); if (m_run != NULL) return (m_run); } return (NULL); } /* * Search for the given physical page "m" in the free lists. If the search * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa, pa_half; vm_page_t m_set, m_tmp; int order; /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) m_set = &seg->first_page[atop(pa - seg->start)]; else return (FALSE); } if (m_set->order < order) return (FALSE); if (m_set->order == VM_NFREEORDER) return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); /* * Next, remove "m_set" from the free lists. Finally, extract * "m" from "m_set" using an iterative algorithm: While "m_set" * is larger than a page, shrink "m_set" by returning the half * of "m_set" that does not contain "m" to the free lists. */ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) m_tmp = &seg->first_page[atop(pa_half - seg->start)]; else { m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); } /* * Find a run of contiguous physical pages from the specified page list. */ static vm_page_t vm_phys_find_freelist_contig(struct vm_freelist *fl, int oind, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_phys_seg *seg; vm_paddr_t frag, lbound, pa, page_size, pa_end, pa_pre, size; vm_page_t m, m_listed, m_ret; int order; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); /* Search for a run satisfying the specified conditions. 
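* The search below works in units of order-oind blocks plus a trailing fragment: e.g., assuming 4 KB pages and oind == 12 (4096-page blocks), a request for 5000 pages is one full block plus a 904-page fragment, and frag is that remainder expressed in bytes.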
*/ page_size = PAGE_SIZE; size = npages << PAGE_SHIFT; frag = (npages & ~(~0UL << oind)) << PAGE_SHIFT; TAILQ_FOREACH(m_listed, &fl[oind].pl, listq) { /* * Determine if the address range starting at pa is * too low. */ pa = VM_PAGE_TO_PHYS(m_listed); if (pa < low) continue; /* * If this is not the first free oind-block in this range, bail * out. We have seen the first free block already, or will see * it before failing to find an appropriate range. */ seg = &vm_phys_segs[m_listed->segind]; lbound = low > seg->start ? low : seg->start; pa_pre = pa - (page_size << oind); m = &seg->first_page[atop(pa_pre - seg->start)]; if (pa != 0 && pa_pre >= lbound && m->order == oind) continue; if (!vm_addr_align_ok(pa, alignment)) /* Advance to satisfy alignment condition. */ pa = roundup2(pa, alignment); else if (frag != 0 && lbound + frag <= pa) { /* * Back up to the first aligned free block in this * range, without moving below lbound. */ pa_end = pa; for (order = oind - 1; order >= 0; order--) { pa_pre = pa_end - (page_size << order); if (!vm_addr_align_ok(pa_pre, alignment)) break; m = &seg->first_page[atop(pa_pre - seg->start)]; if (pa_pre >= lbound && m->order == order) pa_end = pa_pre; } /* * If the extra small blocks are enough to complete the * fragment, use them. Otherwise, look to allocate the * fragment at the other end. */ if (pa_end + frag <= pa) pa = pa_end; } /* Advance as necessary to satisfy boundary conditions. */ if (!vm_addr_bound_ok(pa, size, boundary)) pa = roundup2(pa + 1, boundary); pa_end = pa + size; /* * Determine if the address range is valid (without overflow in * pa_end calculation), and fits within the segment. */ if (pa_end < pa || seg->end < pa_end) continue; m_ret = &seg->first_page[atop(pa - seg->start)]; /* * Determine whether there are enough free oind-blocks here to * satisfy the allocation request. */ pa = VM_PAGE_TO_PHYS(m_listed); do { pa += page_size << oind; if (pa >= pa_end) return (m_ret); m = &seg->first_page[atop(pa - seg->start)]; } while (oind == m->order); /* * Determine if an additional series of free blocks of * diminishing size can help to satisfy the allocation request. */ while (m->order < oind && pa + 2 * (page_size << m->order) > pa_end) { pa += page_size << m->order; if (pa >= pa_end) return (m_ret); m = &seg->first_page[atop(pa - seg->start)]; } } return (NULL); } /* * Find a run of contiguous physical pages from the specified free list * table. */ static vm_page_t vm_phys_find_queues_contig( struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX], u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_freelist *fl; vm_page_t m_ret; vm_paddr_t pa, pa_end, size; int oind, order, pind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); /* Compute the queue that is the best fit for npages. */ order = flsl(npages - 1); /* Search for a large enough free block. */ size = npages << PAGE_SHIFT; for (oind = order; oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*queues)[pind]; TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { /* * Determine if the address range starting at pa * is within the given range, satisfies the * given alignment, and does not cross the given * boundary. 
*/ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; if (low <= pa && pa_end <= high && vm_addr_ok(pa, size, alignment, boundary)) return (m_ret); } } } if (order < VM_NFREEORDER) return (NULL); /* Search for a long-enough sequence of small blocks. */ oind = VM_NFREEORDER - 1; for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*queues)[pind]; m_ret = vm_phys_find_freelist_contig(fl, oind, npages, low, high, alignment, boundary); if (m_ret != NULL) return (m_ret); } return (NULL); } /* * Allocate a contiguous set of physical pages of the given size * "npages" from the free lists. All of the physical pages must be at * or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross * any physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa_end, pa_start; struct vm_freelist *fl; vm_page_t m, m_run; struct vm_phys_seg *seg; struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; int oind, segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); vm_domain_free_assert_locked(VM_DOMAIN(domain)); if (low >= high) return (NULL); queues = NULL; m_run = NULL; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; if (seg->start >= high || seg->domain != domain) continue; if (low >= seg->end) break; if (low <= seg->start) pa_start = seg->start; else pa_start = low; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - pa_start < ptoa(npages)) continue; /* * If a previous segment led to a search using * the same free lists as would this segment, then * we've actually already searched within this * too. So skip it. */ if (seg->free_queues == queues) continue; queues = seg->free_queues; m_run = vm_phys_find_queues_contig(queues, npages, low, high, alignment, boundary); if (m_run != NULL) break; } if (m_run == NULL) return (NULL); /* Allocate pages from the page-range found. */ for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) { fl = (*queues)[m->pool]; oind = m->order; vm_freelist_rem(fl, m, oind); if (m->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind); } /* Return excess pages to the free lists. */ if (&m_run[npages] < m) { fl = (*queues)[VM_FREEPOOL_DEFAULT]; vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl, 0); } return (m_run); } /* * Return the index of the first unused slot which may be the terminating * entry. */ static int vm_phys_avail_count(void) { int i; for (i = 0; phys_avail[i + 1]; i += 2) continue; if (i > PHYS_AVAIL_ENTRIES) panic("Improperly terminated phys_avail %d entries", i); return (i); } /* * Assert that a phys_avail entry is valid. 
*/ static void vm_phys_avail_check(int i) { if (phys_avail[i] & PAGE_MASK) panic("Unaligned phys_avail[%d]: %#jx", i, (intmax_t)phys_avail[i]); if (phys_avail[i+1] & PAGE_MASK) panic("Unaligned phys_avail[%d + 1]: %#jx", i, (intmax_t)phys_avail[i + 1]); if (phys_avail[i + 1] < phys_avail[i]) panic("phys_avail[%d] start %#jx < end %#jx", i, (intmax_t)phys_avail[i], (intmax_t)phys_avail[i+1]); } /* * Return the index of an overlapping phys_avail entry or -1. */ #ifdef NUMA static int vm_phys_avail_find(vm_paddr_t pa) { int i; for (i = 0; phys_avail[i + 1]; i += 2) if (phys_avail[i] <= pa && phys_avail[i + 1] > pa) return (i); return (-1); } #endif /* * Return the index of the largest entry. */ int vm_phys_avail_largest(void) { vm_paddr_t sz, largesz; int largest; int i; largest = 0; largesz = 0; for (i = 0; phys_avail[i + 1]; i += 2) { sz = vm_phys_avail_size(i); if (sz > largesz) { largesz = sz; largest = i; } } return (largest); } vm_paddr_t vm_phys_avail_size(int i) { return (phys_avail[i + 1] - phys_avail[i]); } /* * Split an entry at the address 'pa'. Return zero on success or errno. */ static int vm_phys_avail_split(vm_paddr_t pa, int i) { int cnt; vm_phys_avail_check(i); if (pa <= phys_avail[i] || pa >= phys_avail[i + 1]) panic("vm_phys_avail_split: invalid address"); cnt = vm_phys_avail_count(); if (cnt >= PHYS_AVAIL_ENTRIES) return (ENOSPC); memmove(&phys_avail[i + 2], &phys_avail[i], (cnt - i) * sizeof(phys_avail[0])); phys_avail[i + 1] = pa; phys_avail[i + 2] = pa; vm_phys_avail_check(i); vm_phys_avail_check(i+2); return (0); } /* * Check if a given physical address can be included as part of a crash dump. */ bool vm_phys_is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (true); } return (false); } void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_seg *seg; if (vm_phys_early_nsegs == -1) panic("%s: called after initialization", __func__); if (vm_phys_early_nsegs == nitems(vm_phys_early_segs)) panic("%s: ran out of early segments", __func__); seg = &vm_phys_early_segs[vm_phys_early_nsegs++]; seg->start = start; seg->end = end; } /* * This routine allocates NUMA node specific memory before the page * allocator is bootstrapped. */ vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size) { #ifdef NUMA int mem_index; #endif int i, biggestone; vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align; KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains), ("%s: invalid domain index %d", __func__, domain)); /* * Search the mem_affinity array for the biggest address * range in the desired domain. This is used to constrain * the phys_avail selection below. */ biggestsize = 0; mem_start = 0; mem_end = -1; #ifdef NUMA mem_index = 0; if (mem_affinity != NULL) { for (i = 0;; i++) { size = mem_affinity[i].end - mem_affinity[i].start; if (size == 0) break; if (domain != -1 && mem_affinity[i].domain != domain) continue; if (size > biggestsize) { mem_index = i; biggestsize = size; } } mem_start = mem_affinity[mem_index].start; mem_end = mem_affinity[mem_index].end; } #endif /* * Now find the biggest physical segment within the desired * NUMA domain.
*/ biggestsize = 0; biggestone = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) { /* skip regions that are out of range */ if (phys_avail[i+1] - alloc_size < mem_start || phys_avail[i+1] > mem_end) continue; size = vm_phys_avail_size(i); if (size > biggestsize) { biggestone = i; biggestsize = size; } } alloc_size = round_page(alloc_size); /* * Grab single pages from the front to reduce fragmentation. */ if (alloc_size == PAGE_SIZE) { pa = phys_avail[biggestone]; phys_avail[biggestone] += PAGE_SIZE; vm_phys_avail_check(biggestone); return (pa); } /* * Naturally align large allocations. */ align = phys_avail[biggestone + 1] & (alloc_size - 1); if (alloc_size + align > biggestsize) panic("cannot find a large enough size\n"); if (align != 0 && vm_phys_avail_split(phys_avail[biggestone + 1] - align, biggestone) != 0) /* Wasting memory. */ phys_avail[biggestone + 1] -= align; phys_avail[biggestone + 1] -= alloc_size; vm_phys_avail_check(biggestone); pa = phys_avail[biggestone + 1]; return (pa); } void vm_phys_early_startup(void) { struct vm_phys_seg *seg; int i; for (i = 0; phys_avail[i + 1] != 0; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; i < vm_phys_early_nsegs; i++) { seg = &vm_phys_early_segs[i]; vm_phys_add_seg(seg->start, seg->end); } vm_phys_early_nsegs = -1; #ifdef NUMA /* Force phys_avail to be split by domain. */ if (mem_affinity != NULL) { int idx; for (i = 0; mem_affinity[i].end != 0; i++) { idx = vm_phys_avail_find(mem_affinity[i].start); if (idx != -1 && phys_avail[idx] != mem_affinity[i].start) vm_phys_avail_split(mem_affinity[i].start, idx); idx = vm_phys_avail_find(mem_affinity[i].end); if (idx != -1 && phys_avail[idx] != mem_affinity[i].end) vm_phys_avail_split(mem_affinity[i].end, idx); } } #endif } #ifdef DDB /* * Show the number of physical pages in each of the free lists. */ -DB_SHOW_COMMAND(freepages, db_show_freepages) +DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE) { struct vm_freelist *fl; int flind, oind, pind, dom; for (dom = 0; dom < vm_ndomains; dom++) { db_printf("DOMAIN: %d\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { db_printf("FREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf(" | POOL %d", pind); db_printf("\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf("-- -- "); db_printf("--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { db_printf(" %2.2d (%6.6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; db_printf(" | %6.6d", fl[oind].lcnt); } db_printf("\n"); } db_printf("\n"); } db_printf("\n"); } } #endif diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index 8e3c7eb27078..ec4f821f99ed 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1,2174 +1,2174 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1996, by Steve Passe * All rights reserved. * Copyright (c) 2003 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * 3. 
Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Local APIC support on Pentium and later processors. */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif static MALLOC_DEFINE(M_LAPIC, "local_apic", "Local APIC items"); /* Sanity checks on IDT vectors. */ CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT); CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS); CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); /* * I/O interrupts use non-negative IRQ values. These values are used * to mark unused IDT entries or IDT entries reserved for a non-I/O * interrupt. */ #define IRQ_FREE -1 #define IRQ_TIMER -2 #define IRQ_SYSCALL -3 #define IRQ_DTRACE_RET -4 #define IRQ_EVTCHN -5 enum lat_timer_mode { LAT_MODE_UNDEF = 0, LAT_MODE_PERIODIC = 1, LAT_MODE_ONESHOT = 2, LAT_MODE_DEADLINE = 3, }; /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts * from I/O devices and then forward them on to the local APICs. * * Local APICs can also send interrupts to each other thus providing the * mechanism for IPIs. */ struct lvt { u_int lvt_edgetrigger:1; u_int lvt_activehi:1; u_int lvt_masked:1; u_int lvt_active:1; u_int lvt_mode:16; u_int lvt_vector:8; }; struct lapic { struct lvt la_lvts[APIC_LVT_MAX + 1]; struct lvt la_elvts[APIC_ELVT_MAX + 1]; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; uint64_t la_timer_period; enum lat_timer_mode la_timer_mode; uint32_t lvt_timer_base; uint32_t lvt_timer_last; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static *lapics; /* Global defaults for local APIC LVT entries. 
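* Each initializer below is { lvt_edgetrigger, lvt_activehi, lvt_masked, lvt_active, lvt_mode, lvt_vector }, matching the field order of struct lvt above.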
*/ static struct lvt lvts[APIC_LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; /* Global defaults for AMD local APIC ELVT entries. */ static struct lvt elvts[APIC_ELVT_MAX + 1] = { { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, }; static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ IDTVEC(apic_isr2), /* 64 - 95 */ IDTVEC(apic_isr3), /* 96 - 127 */ IDTVEC(apic_isr4), /* 128 - 159 */ IDTVEC(apic_isr5), /* 160 - 191 */ IDTVEC(apic_isr6), /* 192 - 223 */ IDTVEC(apic_isr7), /* 224 - 255 */ }; static inthand_t *ioint_pti_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1_pti), /* 32 - 63 */ IDTVEC(apic_isr2_pti), /* 64 - 95 */ IDTVEC(apic_isr3_pti), /* 96 - 127 */ IDTVEC(apic_isr4_pti), /* 128 - 159 */ IDTVEC(apic_isr5_pti), /* 160 - 191 */ IDTVEC(apic_isr6_pti), /* 192 - 223 */ IDTVEC(apic_isr7_pti), /* 224 - 255 */ }; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd); volatile char *lapic_map; vm_paddr_t lapic_paddr = DEFAULT_APIC_BASE; int x2apic_mode; int lapic_eoi_suppression; static int lapic_timer_tsc_deadline; static u_long lapic_timer_divisor, count_freq; static struct eventtimer lapic_et; #ifdef SMP static uint64_t lapic_ipi_wait_mult; static int __read_mostly lapic_ds_idle_timeout = 1000000; #endif unsigned int max_apic_id; SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "APIC options"); SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD, &lapic_eoi_suppression, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD, &lapic_timer_tsc_deadline, 0, ""); #ifdef SMP SYSCTL_INT(_hw_apic, OID_AUTO, ds_idle_timeout, CTLFLAG_RWTUN, &lapic_ds_idle_timeout, 0, "timeout (in us) for APIC Delivery Status to become Idle (xAPIC only)"); #endif static void lapic_calibrate_initcount(struct lapic *la); /* * Use __nosanitizethread to exempt the LAPIC I/O accessors from KCSan * instrumentation. Otherwise, if x2APIC is not available, use of the global * lapic_map will generate a KCSan false positive. While the mapping is * shared among all CPUs, the physical access will always take place on the * local CPU's APIC, so there isn't in fact a race here. Furthermore, the * KCSan warning printf can cause a panic if issued during LAPIC access, * due to attempted recursive use of event timer resources. 
*/ static uint32_t __nosanitizethread lapic_read32(enum LAPIC_REGISTERS reg) { uint32_t res; if (x2apic_mode) { res = rdmsr32(MSR_APIC_000 + reg); } else { res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL); } return (res); } static void __nosanitizethread lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { mfence(); lfence(); wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } static void __nosanitizethread lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } #ifdef SMP static uint64_t lapic_read_icr_lo(void) { return (lapic_read32(LAPIC_ICR_LO)); } static void lapic_write_icr(uint32_t vhi, uint32_t vlo) { register_t saveintr; uint64_t v; if (x2apic_mode) { v = ((uint64_t)vhi << 32) | vlo; mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v); } else { saveintr = intr_disable(); lapic_write32(LAPIC_ICR_HI, vhi); lapic_write32(LAPIC_ICR_LO, vlo); intr_restore(saveintr); } } static void lapic_write_icr_lo(uint32_t vlo) { if (x2apic_mode) { mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, vlo); } else { lapic_write32(LAPIC_ICR_LO, vlo); } } static void lapic_write_self_ipi(uint32_t vector) { KASSERT(x2apic_mode, ("SELF IPI write in xAPIC mode")); wrmsr(MSR_APIC_000 + LAPIC_SELF_IPI, vector); } #endif /* SMP */ static void lapic_enable_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); apic_base |= APICBASE_X2APIC | APICBASE_ENABLED; wrmsr(MSR_APICBASE, apic_base); } bool lapic_is_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) == (APICBASE_X2APIC | APICBASE_ENABLED)); } static void lapic_enable(void); static void lapic_resume(struct pic *pic, bool suspend_cancelled); static void lapic_timer_oneshot(struct lapic *); static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t); static void lapic_timer_periodic(struct lapic *); static void lapic_timer_deadline(struct lapic *); static void lapic_timer_stop(struct lapic *); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period); static int lapic_et_stop(struct eventtimer *et); static u_int apic_idt_to_irq(u_int apic_id, u_int vector); static void lapic_set_tpr(u_int vector); struct pic lapic_pic = { .pic_resume = lapic_resume }; static uint32_t lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value) { value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) value |= APIC_LVT_TM; if (lvt->lvt_activehi == 0) value |= APIC_LVT_IIPP_INTALO; if (lvt->lvt_masked) value |= APIC_LVT_M; value |= lvt->lvt_mode; switch (lvt->lvt_mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: if (!lvt->lvt_edgetrigger && bootverbose) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value &= ~APIC_LVT_TM; } /* Use a vector of 0. 
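* The vector field is ignored by the hardware for these delivery modes, so leaving it at zero is harmless.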
*/ break; case APIC_LVT_DM_FIXED: value |= lvt->lvt_vector; break; default: panic("bad APIC LVT delivery mode: %#x\n", value); } return (value); } static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { struct lvt *lvt; KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin)); if (la->la_lvts[pin].lvt_active) lvt = &la->la_lvts[pin]; else lvt = &lvts[pin]; return (lvt_mode_impl(la, lvt, pin, value)); } static uint32_t elvt_mode(struct lapic *la, u_int idx, uint32_t value) { struct lvt *elvt; KASSERT(idx <= APIC_ELVT_MAX, ("%s: idx %u out of range", __func__, idx)); elvt = &la->la_elvts[idx]; KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx)); KASSERT(elvt->lvt_edgetrigger, ("%s: ELVT%u is not edge triggered", __func__, idx)); KASSERT(elvt->lvt_activehi, ("%s: ELVT%u is not active high", __func__, idx)); return (lvt_mode_impl(la, elvt, idx, value)); } /* * Map the local APIC and setup necessary interrupt vectors. */ void lapic_init(vm_paddr_t addr) { #ifdef SMP uint64_t r, r1, r2, rx; #endif uint32_t ver; int i; bool arat; /* * Enable x2APIC mode if possible. Map the local APIC * registers page. * * Keep the LAPIC registers page mapped uncached for x2APIC * mode too, to have direct map page attribute set to * uncached. This is needed to work around CPU errata present * on all Intel processors. */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic_paddr = addr; lapic_map = pmap_mapdev(addr, PAGE_SIZE); if (x2apic_mode) { lapic_enable_x2apic(); lapic_map = NULL; } /* Setup the spurious interrupt handler. */ setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Perform basic initialization of the BSP's local APIC. */ lapic_enable(); /* Set BSP's per-CPU local APIC ID. */ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. */ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), SDT_APIC, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { /* Set if APIC timer runs in C3. */ arat = (cpu_power_eax & CPUTPM1_ARAT); bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; lapic_et.et_quality = 100; } if ((cpu_feature & CPUID_TSC) != 0 && (cpu_feature2 & CPUID2_TSCDLT) != 0 && tsc_is_invariant && tsc_freq != 0) { lapic_timer_tsc_deadline = 1; TUNABLE_INT_FETCH("hw.lapic_tsc_deadline", &lapic_timer_tsc_deadline); } lapic_et.et_frequency = 0; /* We don't know frequency yet, so trying to guess. */ lapic_et.et_min_period = 0x00001000LL; lapic_et.et_max_period = SBT_1S; lapic_et.et_start = lapic_et_start; lapic_et.et_stop = lapic_et_stop; lapic_et.et_priv = NULL; et_register(&lapic_et); } /* * Set lapic_eoi_suppression after lapic_enable(), to not * enable suppression in the hardware prematurely. Note that * we by default enable suppression even when system only has * one IO-APIC, since EOI is broadcasted to all APIC agents, * including CPUs, otherwise. * * It seems that at least some KVM versions report * EOI_SUPPRESSION bit, but auto-EOI does not work. 
*/ ver = lapic_read32(LAPIC_VERSION); if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) { lapic_eoi_suppression = 1; if (vm_guest == VM_GUEST_KVM) { if (bootverbose) printf( "KVM -- disabling lapic eoi suppression\n"); lapic_eoi_suppression = 0; } TUNABLE_INT_FETCH("hw.lapic_eoi_suppression", &lapic_eoi_suppression); } #ifdef SMP #define LOOPS 100000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which * approximately delay execution for 1 microsecond (the * argument to lapic_ipi_wait() is in microseconds). * * We assume that TSC is present and already measured. * Possible TSC frequency jumps are irrelevant to the * calibration loop below, the CPU clock management code is * not yet started, and we do not enter sleep states. */ KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0, ("TSC not initialized")); if (!x2apic_mode) { r = rdtsc(); for (rx = 0; rx < LOOPS; rx++) { (void)lapic_read_icr_lo(); ia32_pause(); } r = rdtsc() - r; r1 = tsc_freq * LOOPS; r2 = r * 1000000; lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; if (bootverbose) { printf("LAPIC: ipi_wait() us multiplier %ju (r %ju " "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r, (uintmax_t)tsc_freq); } } #undef LOOPS #endif /* SMP */ } /* * Create a local APIC instance. */ void lapic_create(u_int apic_id, int boot_cpu) { int i; if (apic_id > max_apic_id) { printf("APIC: Ignoring local APIC with ID %d\n", apic_id); if (boot_cpu) panic("Can't ignore BSP"); return; } KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", apic_id)); /* * Assume no local LVT overrides and a cluster of 0 and * intra-cluster ID of 0. */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; for (i = 0; i <= APIC_LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } for (i = 0; i <= APIC_ELVT_MAX; i++) { lapics[apic_id].la_elvts[i] = elvts[i]; lapics[apic_id].la_elvts[i].lvt_active = 0; } for (i = 0; i <= APIC_NUM_IOINTS; i++) lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; #ifdef KDTRACE_HOOKS lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] = IRQ_DTRACE_RET; #endif #ifdef XENHVM lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN; #endif #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } static inline uint32_t amd_read_ext_features(void) { uint32_t version; if (cpu_vendor_id != CPU_VENDOR_AMD && cpu_vendor_id != CPU_VENDOR_HYGON) return (0); version = lapic_read32(LAPIC_VERSION); if ((version & APIC_VER_AMD_EXT_SPACE) != 0) return (lapic_read32(LAPIC_EXT_FEATURES)); else return (0); } static inline uint32_t amd_read_elvt_count(void) { uint32_t extf; uint32_t count; extf = amd_read_ext_features(); count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; count = min(count, APIC_ELVT_MAX + 1); return (count); } /* * Dump contents of local APIC registers */ void lapic_dump(const char* str) { uint32_t version; uint32_t maxlvt; uint32_t extf; int elvt_count; int i; version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x", lapic_read32(LAPIC_ID), version, lapic_read32(LAPIC_LDR), x2apic_mode ? 
0 : lapic_read32(LAPIC_DFR)); if ((cpu_feature2 & CPUID2_X2APIC) != 0) printf(" x2APIC: %d", x2apic_mode); printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1), lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR)); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x", lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL), lapic_read32(LAPIC_LVT_ERROR)); if (maxlvt >= APIC_LVT_PMC) printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT)); printf("\n"); if (maxlvt >= APIC_LVT_CMCI) printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI)); extf = amd_read_ext_features(); if (extf != 0) { printf(" AMD ext features: 0x%08x", extf); elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) printf("%s elvt%d: 0x%08x", (i % 4) ? "" : "\n ", i, lapic_read32(LAPIC_EXT_LVT0 + i)); printf("\n"); } } void lapic_xapic_mode(void) { register_t saveintr; saveintr = intr_disable(); if (x2apic_mode) lapic_enable_x2apic(); intr_restore(saveintr); } void lapic_setup(int boot) { struct lapic *la; uint32_t version; uint32_t maxlvt; register_t saveintr; int elvt_count; int i; saveintr = intr_disable(); la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); /* Setup spurious vector and enable the local APIC. */ lapic_enable(); /* Program LINT[01] LVT entries. */ lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0, lapic_read32(LAPIC_LVT_LINT0))); lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1, lapic_read32(LAPIC_LVT_LINT1))); /* Program the PMC LVT entry if present. */ if (maxlvt >= APIC_LVT_PMC) { lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, LAPIC_LVT_PCINT)); } /* * Program the timer LVT. Calibration is deferred until it is certain * that we have a reliable timecounter. */ la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER, lapic_read32(LAPIC_LVT_TIMER)); la->lvt_timer_last = la->lvt_timer_base; lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base); if (boot) la->la_timer_mode = LAT_MODE_UNDEF; else if (la->la_timer_mode != LAT_MODE_UNDEF) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); switch (la->la_timer_mode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_periodic(la); break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(la); break; case LAT_MODE_DEADLINE: lapic_timer_deadline(la); break; default: panic("corrupted la_timer_mode %p %d", la, la->la_timer_mode); } } /* Program error LVT and clear any existing errors. */ lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR, lapic_read32(LAPIC_LVT_ERROR))); lapic_write32(LAPIC_ESR, 0); /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. */ if (maxlvt >= APIC_LVT_CMCI) { lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI, lapic_read32(LAPIC_LVT_CMCI))); } elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) { if (la->la_elvts[i].lvt_active) lapic_write32(LAPIC_EXT_LVT0 + i, elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i))); } intr_restore(saveintr); } static void lapic_intrcnt(void *dummy __unused) { struct pcpu *pc; struct lapic *la; char buf[MAXCOMLEN + 1]; /* If there are no APICs, skip this function. 
*/ if (lapics == NULL) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { la = &lapics[pc->pc_apic_id]; if (!la->la_present) continue; snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid); intrcnt_add(buf, &la->la_timer_count); } } SYSINIT(lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, lapic_intrcnt, NULL); void lapic_reenable_pmc(void) { #ifdef HWPMC_HOOKS uint32_t value; value = lapic_read32(LAPIC_LVT_PCINT); value &= ~APIC_LVT_M; lapic_write32(LAPIC_LVT_PCINT, value); #endif } #ifdef HWPMC_HOOKS static void lapic_update_pmc(void *dummy) { struct lapic *la; la = &lapics[lapic_id()]; lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, lapic_read32(LAPIC_LVT_PCINT))); } #endif void lapic_calibrate_timer(void) { struct lapic *la; register_t intr; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; #endif intr = intr_disable(); la = &lapics[lapic_id()]; lapic_calibrate_initcount(la); intr_restore(intr); if (lapic_timer_tsc_deadline && bootverbose) { printf("lapic: deadline tsc mode, Frequency %ju Hz\n", (uintmax_t)tsc_freq); } } int lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return (0); #endif /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return (0); lvts[APIC_LVT_PMC].lvt_masked = 0; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #else #ifdef SMP /* * If hwpmc was loaded at boot time then the APs may not be * started yet. In that case, don't forward the request to * them as they will program the lvt when they start. */ if (smp_started) smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); else #endif lapic_update_pmc(NULL); #endif return (1); #else return (0); #endif } void lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; #endif /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return; lvts[APIC_LVT_PMC].lvt_masked = 1; #ifdef SMP /* The APs should always be started when hwpmc is unloaded. */ KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); #endif smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #endif } static int lapic_calibrate_initcount_cpuid_vm(void) { u_int regs[4]; uint64_t freq; /* Get value from CPUID leaf if possible. */ if (vm_guest == VM_GUEST_NO) return (false); if (hv_high < 0x40000010) return (false); do_cpuid(0x40000010, regs); freq = (uint64_t)(regs[1]) * 1000; /* Pick timer divisor. */ lapic_timer_divisor = 2; do { if (freq / lapic_timer_divisor < APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) return (false); /* Record divided frequency. */ count_freq = freq / lapic_timer_divisor; return (true); } static uint64_t cb_lapic_getcount(void) { return (APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER)); } static void lapic_calibrate_initcount(struct lapic *la) { uint64_t freq; if (lapic_calibrate_initcount_cpuid_vm()) goto done; /* Calibrate the APIC timer frequency. 
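* The approach below: run the timer in one-shot mode with divisor 2 and the maximum initial count, measure its tick rate against a reference clock via clockcalib(), then pick the smallest power-of-two divisor that lets a full second of ticks fit in the 32-bit count register.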
*/ lapic_timer_set_divisor(2); lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT); fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); freq = clockcalib(cb_lapic_getcount, "lapic"); fpu_kern_leave(curthread, NULL); /* Pick a different divisor if necessary. */ lapic_timer_divisor = 2; do { if (freq * 2 / lapic_timer_divisor < APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) panic("lapic: Divisor too big"); count_freq = freq * 2 / lapic_timer_divisor; done: if (bootverbose) { printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, count_freq); } } static void lapic_change_mode(struct eventtimer *et, struct lapic *la, enum lat_timer_mode newmode) { if (la->la_timer_mode == newmode) return; switch (newmode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; case LAT_MODE_DEADLINE: et->et_frequency = tsc_freq; break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; default: panic("lapic_change_mode %d", newmode); } la->la_timer_mode = newmode; et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; } static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; if (period != 0) { lapic_change_mode(et, la, LAT_MODE_PERIODIC); la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32; lapic_timer_periodic(la); } else if (lapic_timer_tsc_deadline) { lapic_change_mode(et, la, LAT_MODE_DEADLINE); la->la_timer_period = (et->et_frequency * first) >> 32; lapic_timer_deadline(la); } else { lapic_change_mode(et, la, LAT_MODE_ONESHOT); la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32; lapic_timer_oneshot(la); } return (0); } static int lapic_et_stop(struct eventtimer *et) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; lapic_timer_stop(la); la->la_timer_mode = LAT_MODE_UNDEF; return (0); } void lapic_disable(void) { uint32_t value; /* Software disable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~APIC_SVR_SWEN; lapic_write32(LAPIC_SVR, value); } static void lapic_enable(void) { uint32_t value; /* Program the spurious vector to enable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT; if (lapic_eoi_suppression) value |= APIC_SVR_EOI_SUPPRESSION; lapic_write32(LAPIC_SVR, value); } /* Reset the local APIC on the BSP during resume. */ static void lapic_resume(struct pic *pic, bool suspend_cancelled) { lapic_setup(0); } int lapic_id(void) { uint32_t v; KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped")); v = lapic_read32(LAPIC_ID); if (!x2apic_mode) v >>= APIC_ID_SHIFT; return (v); } int lapic_intr_pending(u_int vector) { uint32_t irr; /* * The IRR registers are an array of registers each of which * only describes 32 interrupts in the low 32 bits. Thus, we * divide the vector by 32 to get the register index. * Finally, we modulus the vector by 32 to determine the * individual bit to test. 
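* For example, a pending vector 0x41 (65) shows up in register LAPIC_IRR0 + 2 (65 / 32 == 2) at bit position 1 (65 % 32).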
*/ irr = lapic_read32(LAPIC_IRR0 + vector / 32); return (irr & 1 << (vector % 32)); } void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", __func__, apic_id)); KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", __func__, cluster)); KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, ("%s: intra cluster id %u too big", __func__, cluster_id)); la = &lapics[apic_id]; la->la_cluster = cluster; la->la_cluster_id = cluster_id; } int lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_masked = masked; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_masked = masked; lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked"); return (0); } int lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvt = &lvts[pin]; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lvt = &lapics[apic_id].la_lvts[pin]; lvt->lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } lvt->lvt_mode = mode; switch (mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: lvt->lvt_edgetrigger = 1; lvt->lvt_activehi = 1; if (mode == APIC_LVT_DM_EXTINT) lvt->lvt_masked = 1; else lvt->lvt_masked = 0; break; default: panic("Unsupported delivery mode: 0x%x\n", mode); } if (bootverbose) { printf(" Routing "); switch (mode) { case APIC_LVT_DM_NMI: printf("NMI"); break; case APIC_LVT_DM_SMI: printf("SMI"); break; case APIC_LVT_DM_INIT: printf("INIT"); break; case APIC_LVT_DM_EXTINT: printf("ExtINT"); break; } printf(" -> LINT%u\n", pin); } return (0); } int lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_active = 1; lapics[apic_id].la_lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u polarity: %s\n", pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) { if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u trigger: %s\n", pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. 
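* For example, lapic_set_tpr(0), as done from lapic_setup(), leaves all interrupt vectors unblocked.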
*/ static void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR lapic_write32(LAPIC_TPR, vector); #else uint32_t tpr; tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO; tpr |= vector; lapic_write32(LAPIC_TPR, tpr); #endif } void lapic_eoi(void) { lapic_write32_nofence(LAPIC_EOI, 0); } void lapic_handle_intr(int vector, struct trapframe *frame) { struct intsrc *isrc; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); kmsan_mark(&vector, sizeof(vector), KMSAN_STATE_INITED); kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id), vector)); intr_execute_handlers(isrc, frame); } void lapic_handle_timer(struct trapframe *frame) { struct lapic *la; struct trapframe *oldframe; struct thread *td; /* Send EOI first thing. */ lapic_eoi(); kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); #if defined(SMP) && !defined(SCHED_ULE) /* * Don't do any accounting for the disabled HTT cores, since it * will provide misleading numbers for the userland. * * No locking is necessary here, since even if we lose the race * when hlt_cpus_mask changes it is not a big deal, really. * * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask * and unlike other schedulers it actually schedules threads to * those CPUs. */ if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask)) return; #endif /* Look up our local APIC structure for the tick counters. */ la = &lapics[PCPU_GET(apic_id)]; (*la->la_timer_count)++; critical_enter(); if (lapic_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } critical_exit(); } static void lapic_timer_set_divisor(u_int divisor) { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors), ("lapic: invalid divisor %u", divisor)); lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]); } static void lapic_timer_oneshot(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_ONE_SHOT; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count) { uint32_t value; value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, count); } static void lapic_timer_periodic(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_PERIODIC; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_deadline(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_TSCDLT; if (value != la->lvt_timer_last) { la->lvt_timer_last = value; lapic_write32_nofence(LAPIC_LVT_TIMER, value); if (!x2apic_mode) mfence(); } wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc()); } static void lapic_timer_stop(struct lapic *la) { uint32_t value; if (la->la_timer_mode == LAT_MODE_DEADLINE) { wrmsr(MSR_TSC_DEADLINE, 0); mfence(); } else { value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVT_M; 
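/* With the mask bit (APIC_LVT_M) set, the value written back below stops further timer interrupts. */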
la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); } } void lapic_handle_cmc(void) { lapic_eoi(); cmc_intr(); } /* * Called from the mca_init() to activate the CMC interrupt if this CPU is * responsible for monitoring any MC banks for CMC events. Since mca_init() * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. */ void lapic_enable_cmc(void) { u_int apic_id; #ifdef DEV_ATPIC if (!x2apic_mode && lapic_map == NULL) return; #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0; lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1; } int lapic_enable_mca_elvt(void) { u_int apic_id; uint32_t value; int elvt_count; #ifdef DEV_ATPIC if (lapic_map == NULL) return (-1); #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); elvt_count = amd_read_elvt_count(); if (elvt_count <= APIC_ELVT_MCA) return (-1); value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA); if ((value & APIC_LVT_M) == 0) { if (bootverbose) printf("AMD MCE Thresholding Extended LVT is already active\n"); return (APIC_ELVT_MCA); } lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0; lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1; return (APIC_ELVT_MCA); } void lapic_handle_error(void) { uint32_t esr; /* * Read the contents of the error status register. Write to * the register first before reading from it to force the APIC * to update its value to indicate any errors that have * occurred since the previous write to the register. */ lapic_write32(LAPIC_ESR, 0); esr = lapic_read32(LAPIC_ESR); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); } u_int apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; #else return 0; #endif } /* Request a free IDT vector to be used by the specified IRQ. */ u_int apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); /* * Search for a free vector. Currently we just use a very simple * algorithm to find the first free vector. */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); return (vector + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); return (0); } /* * Request 'count' free contiguous IDT vectors to be used by 'count' * IRQs. 'count' must be a power of two and the vectors will be * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. */ u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; KASSERT(powerof2(count), ("bad count")); KASSERT(powerof2(align), ("bad align")); KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif /* * Search for 'count' free vectors. As with apic_alloc_vector(), * this just uses a simple first fit algorithm. */ run = 0; first = 0; mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. */ if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) { run = 0; first = 0; continue; } /* Start a new run if run == 0 and vector is aligned. 
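* For instance, with align == 4 only offsets whose low two bits are clear (0, 4, 8, ...) may start a run; any other offset is skipped.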
*/ if (run == 0) { if ((vector & (align - 1)) != 0) continue; first = vector; } run++; /* Keep looping if the run isn't long enough yet. */ if (run < count) continue; /* Found a run, assign IRQs and return the first vector. */ for (vector = 0; vector < count; vector++) lapics[apic_id].la_ioint_irqs[first + vector] = irqs[vector]; mtx_unlock_spin(&icu_lock); return (first + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count); return (0); } /* * Enable a vector for a particular apic_id. Since all lapics share idt * entries and ioint_handlers this enables the vector on all lapics. lapics * which do not have the vector configured would report spurious interrupts * should it fire. */ void apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], SDT_APIC, SEL_KPL, GSEL_APIC); } void apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef notyet /* * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ void apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif /* * Bind us to the cpu that owned the vector before freeing it so * we don't lose an interrupt delivery race. */ td = curthread; if (!rebooting) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n"); sched_bind(td, apic_cpuid(apic_id)); thread_unlock(td); } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); sched_unbind(td); thread_unlock(td); } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ static u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS]; if (irq < 0) irq = 0; return (irq); } #ifdef DDB /* * Dump data about APIC IDT vector mappings. 
-DB_SHOW_COMMAND(apic, db_show_apic)
+DB_SHOW_COMMAND_FLAGS(apic, db_show_apic, DB_CMD_MEMSAFE)
{
	struct intsrc *isrc;
	int i, verbose;
	u_int apic_id;
	u_int irq;

	if (strcmp(modif, "vv") == 0)
		verbose = 2;
	else if (strcmp(modif, "v") == 0)
		verbose = 1;
	else
		verbose = 0;
	for (apic_id = 0; apic_id <= max_apic_id; apic_id++) {
		if (lapics[apic_id].la_present == 0)
			continue;
		db_printf("Interrupts bound to lapic %u\n", apic_id);
		for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
			irq = lapics[apic_id].la_ioint_irqs[i];
			if (irq == IRQ_FREE || irq == IRQ_SYSCALL)
				continue;
#ifdef KDTRACE_HOOKS
			if (irq == IRQ_DTRACE_RET)
				continue;
#endif
#ifdef XENHVM
			if (irq == IRQ_EVTCHN)
				continue;
#endif
			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
			if (irq == IRQ_TIMER)
				db_printf("lapic timer\n");
			else if (irq < num_io_irqs) {
				isrc = intr_lookup_source(irq);
				if (isrc == NULL || verbose == 0)
					db_printf("IRQ %u\n", irq);
				else
					db_dump_intr_event(isrc->is_event,
					    verbose == 2);
			} else
				db_printf("IRQ %u ???\n", irq);
		}
	}
}

static void
dump_mask(const char *prefix, uint32_t v, int base)
{
	int i, first;

	first = 1;
	for (i = 0; i < 32; i++)
		if (v & (1 << i)) {
			if (first) {
				db_printf("%s:", prefix);
				first = 0;
			}
			db_printf(" %02x", base + i);
		}
	if (!first)
		db_printf("\n");
}

/* Show info from the lapic regs for this CPU. */
-DB_SHOW_COMMAND(lapic, db_show_lapic)
+DB_SHOW_COMMAND_FLAGS(lapic, db_show_lapic, DB_CMD_MEMSAFE)
{
	uint32_t v;

	db_printf("lapic ID = %d\n", lapic_id());
	v = lapic_read32(LAPIC_VERSION);
	db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf);
	db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
	v = lapic_read32(LAPIC_SVR);
	db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR,
	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
	db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR));

#define	dump_field(prefix, regn, index)					\
	dump_mask(__XSTRING(prefix ## index),				\
	    lapic_read32(LAPIC_ ## regn ## index),			\
	    index * 32)

	db_printf("In-service Interrupts:\n");
	dump_field(isr, ISR, 0);
	dump_field(isr, ISR, 1);
	dump_field(isr, ISR, 2);
	dump_field(isr, ISR, 3);
	dump_field(isr, ISR, 4);
	dump_field(isr, ISR, 5);
	dump_field(isr, ISR, 6);
	dump_field(isr, ISR, 7);

	db_printf("TMR Interrupts:\n");
	dump_field(tmr, TMR, 0);
	dump_field(tmr, TMR, 1);
	dump_field(tmr, TMR, 2);
	dump_field(tmr, TMR, 3);
	dump_field(tmr, TMR, 4);
	dump_field(tmr, TMR, 5);
	dump_field(tmr, TMR, 6);
	dump_field(tmr, TMR, 7);

	db_printf("IRR Interrupts:\n");
	dump_field(irr, IRR, 0);
	dump_field(irr, IRR, 1);
	dump_field(irr, IRR, 2);
	dump_field(irr, IRR, 3);
	dump_field(irr, IRR, 4);
	dump_field(irr, IRR, 5);
	dump_field(irr, IRR, 6);
	dump_field(irr, IRR, 7);

#undef dump_field
}
#endif

/*
 * APIC probing support code. This includes code to manage enumerators.
 */

static SLIST_HEAD(, apic_enumerator) enumerators =
	SLIST_HEAD_INITIALIZER(enumerators);
static struct apic_enumerator *best_enum;

void
apic_register_enumerator(struct apic_enumerator *enumerator)
{
#ifdef INVARIANTS
	struct apic_enumerator *apic_enum;

	SLIST_FOREACH(apic_enum, &enumerators, apic_next) {
		if (apic_enum == enumerator)
			panic("%s: Duplicate register of %s", __func__,
			    enumerator->apic_name);
	}
#endif
	SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
}

/*
 * We have to look for CPUs very, very early because certain subsystems
 * want to know how many CPUs we have extremely early on in the boot
 * process.
 */
static void
apic_init(void *dummy __unused)
{
	struct apic_enumerator *enumerator;
	int retval, best;

	/* We only support built-in local APICs. */
	if (!(cpu_feature & CPUID_APIC))
		return;

	/* Don't probe if APIC mode is disabled. */
	if (resource_disabled("apic", 0))
		return;

	/* Probe all the enumerators to find the best match. */
	best_enum = NULL;
	best = 0;
	SLIST_FOREACH(enumerator, &enumerators, apic_next) {
		retval = enumerator->apic_probe();
		if (retval > 0)
			continue;
		if (best_enum == NULL || best < retval) {
			best_enum = enumerator;
			best = retval;
		}
	}
	if (best_enum == NULL) {
		if (bootverbose)
			printf("APIC: Could not find any APICs.\n");
#ifndef DEV_ATPIC
		panic("running without device atpic requires a local APIC");
#endif
		return;
	}

	if (bootverbose)
		printf("APIC: Using the %s enumerator.\n",
		    best_enum->apic_name);

#ifdef I686_CPU
	/*
	 * To work around an erratum, we disable the local APIC on some
	 * CPUs during early startup. We need to turn the local APIC back
	 * on for such CPUs now.
	 */
	ppro_reenable_apic();
#endif

	/* Probe the CPUs in the system. */
	retval = best_enum->apic_probe_cpus();
	if (retval != 0)
		printf("%s: Failed to probe CPUs: returned %d\n",
		    best_enum->apic_name, retval);
}
SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL);

/*
 * Set up the local APIC. We have to do this prior to starting up the APs
 * in the SMP case.
 */
static void
apic_setup_local(void *dummy __unused)
{
	int retval;

	if (best_enum == NULL)
		return;

	lapics = malloc(sizeof(*lapics) * (max_apic_id + 1), M_LAPIC,
	    M_WAITOK | M_ZERO);

	/* Initialize the local APIC. */
	retval = best_enum->apic_setup_local();
	if (retval != 0)
		printf("%s: Failed to setup the local APIC: returned %d\n",
		    best_enum->apic_name, retval);
}
SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL);

/*
 * Set up the I/O APICs.
 */
static void
apic_setup_io(void *dummy __unused)
{
	int retval;

	if (best_enum == NULL)
		return;

	/*
	 * Local APIC must be registered before other PICs and pseudo PICs
	 * for proper suspend/resume order.
	 */
	intr_register_pic(&lapic_pic);

	retval = best_enum->apic_setup_io();
	if (retval != 0)
		printf("%s: Failed to setup I/O APICs: returned %d\n",
		    best_enum->apic_name, retval);

	/*
	 * Finish setting up the local APIC on the BSP once we know
	 * how to properly program the LINT pins. In particular, this
	 * enables the EOI suppression mode, if the LAPIC supports it and
	 * the user did not disable the mode.
	 */
	lapic_setup(1);
	if (bootverbose)
		lapic_dump("BSP");

	/* Enable the MSI "pic". */
	msi_init();

#ifdef XENHVM
	xen_intr_alloc_irqs();
#endif
}
SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);

#ifdef SMP
/*
 * Inter Processor Interrupt functions. The lapic_ipi_*() functions are
 * private to the MD code. The public interface for the rest of the
 * kernel is defined in mp_machdep.c.
 */

/*
 * Wait delay microseconds for IPI to be sent. If delay is -1, we
 * wait forever.
 */
int
lapic_ipi_wait(int delay)
{
	uint64_t rx;

	/* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
	if (x2apic_mode)
		return (1);

	for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
		if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
		    APIC_DELSTAT_IDLE)
			return (1);
		ia32_pause();
	}
	return (0);
}

void
lapic_ipi_raw(register_t icrlo, u_int dest)
{
	uint32_t icrhi;

	/* XXX: Need more sanity checking of icrlo? */
	KASSERT(x2apic_mode || lapic_map != NULL,
	    ("%s called too early", __func__));
	KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
	    ("%s: invalid dest field", __func__));
	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
	    ("%s: reserved bits set in ICR LO register", __func__));

	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
		if (x2apic_mode)
			icrhi = dest;
		else
			icrhi = dest << APIC_ID_SHIFT;
		lapic_write_icr(icrhi, icrlo);
	} else {
		lapic_write_icr_lo(icrlo);
	}
}

#ifdef DETECT_DEADLOCK
#define	AFTER_SPIN	50
#endif

static void
native_lapic_ipi_vectored(u_int vector, int dest)
{
	register_t icrlo, destfield;

	KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
	    ("%s: invalid vector %d", __func__, vector));

	destfield = 0;
	switch (dest) {
	case APIC_IPI_DEST_SELF:
		if (x2apic_mode && vector < IPI_NMI_FIRST) {
			lapic_write_self_ipi(vector);
			return;
		}
		icrlo = APIC_DEST_SELF;
		break;
	case APIC_IPI_DEST_ALL:
		icrlo = APIC_DEST_ALLISELF;
		break;
	case APIC_IPI_DEST_OTHERS:
		icrlo = APIC_DEST_ALLESELF;
		break;
	default:
		icrlo = 0;
		KASSERT(x2apic_mode ||
		    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
		    ("%s: invalid destination 0x%x", __func__, dest));
		destfield = dest;
	}

	/*
	 * NMI IPIs are just fake vectors used to send an NMI. Use special
	 * rules regarding NMIs if passed, otherwise specify the vector.
	 */
	if (vector >= IPI_NMI_FIRST)
		icrlo |= APIC_DELMODE_NMI;
	else
		icrlo |= vector | APIC_DELMODE_FIXED;
	icrlo |= APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;

	/* Wait for an earlier IPI to finish. */
	if (!lapic_ipi_wait(lapic_ds_idle_timeout)) {
		if (KERNEL_PANICKED())
			return;
		else
			panic("APIC: Previous IPI is stuck");
	}

	lapic_ipi_raw(icrlo, destfield);

#ifdef DETECT_DEADLOCK
	/* Wait for IPI to be delivered. */
	if (!lapic_ipi_wait(AFTER_SPIN)) {
#ifdef needsattention
		/*
		 * XXX FIXME:
		 *
		 * The above function waits for the message to actually be
		 * delivered. It breaks out after an arbitrary timeout
		 * since the message should eventually be delivered (at
		 * least in theory) and, if it wasn't, we would catch
		 * the failure with the check above when the next IPI is
		 * sent.
		 *
		 * We could skip this wait entirely, EXCEPT it probably
		 * protects us from other routines that assume that the
		 * message was delivered and acted upon when this function
		 * returns.
		 */
		printf("APIC: IPI might be stuck\n");
#else /* !needsattention */
		/* Wait until message is sent without a timeout. */
		while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
			ia32_pause();
#endif /* needsattention */
	}
#endif /* DETECT_DEADLOCK */
}

void (*ipi_vectored)(u_int, int) = &native_lapic_ipi_vectored;
#endif /* SMP */

/*
 * Since the IDT is shared by all CPUs the IPI slot update needs to be globally
 * visible.
 *
 * Consider the case where an IPI is generated immediately after allocation:
 *     vector = lapic_ipi_alloc(ipifunc);
 *     ipi_selected(other_cpus, vector);
 *
 * In xAPIC mode a write to ICR_LO has serializing semantics because the
 * APIC page is mapped as an uncached region. In x2APIC mode there is an
 * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
 * the IDT slot update is globally visible before the IPI is delivered.
 */
int
lapic_ipi_alloc(inthand_t *ipifunc)
{
	struct gate_descriptor *ip;
	long func;
	int idx, vector;

	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
	    ("invalid ipifunc %p", ipifunc));

	vector = -1;
	mtx_lock_spin(&icu_lock);
	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
		ip = &idt[idx];
		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
#ifdef __i386__
		func -= setidt_disp;
#endif
		if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
		    (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
			vector = idx;
			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
			break;
		}
	}
	mtx_unlock_spin(&icu_lock);
	return (vector);
}

void
lapic_ipi_free(int vector)
{
	struct gate_descriptor *ip;
	long func __diagused;

	KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
	    ("%s: invalid vector %d", __func__, vector));

	mtx_lock_spin(&icu_lock);
	ip = &idt[vector];
	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
#ifdef __i386__
	func -= setidt_disp;
#endif
	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
	    func != (uintptr_t)&IDTVEC(rsvd_pti),
	    ("invalid idtfunc %#lx", func));
	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC,
	    SEL_KPL, GSEL_APIC);
	mtx_unlock_spin(&icu_lock);
}
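
A minimal usage sketch of the dynamic IPI interface above, not part of the patch itself: it strings together lapic_ipi_alloc(), ipi_selected() (the call pattern cited in the serialization comment before lapic_ipi_alloc()), and lapic_ipi_free(). The handler name IDTVEC(example_ipi) and the caller-supplied cpuset other_cpus are hypothetical placeholders, not symbols defined in this file.

#if 0	/* illustrative sketch only; assumes a handler IDTVEC(example_ipi) */
static int
example_dynamic_ipi(cpuset_t other_cpus)
{
	int vector;

	/* Reserve a free slot in the shared IDT and install the handler. */
	vector = lapic_ipi_alloc(&IDTVEC(example_ipi));
	if (vector == -1)
		return (-1);		/* no dynamic IPI slots left */

	/*
	 * Per the comment above lapic_ipi_alloc(), the IDT slot update is
	 * globally visible before the ICR write, so the IPI may be sent
	 * immediately after allocation.
	 */
	ipi_selected(other_cpus, vector);

	/* Once no CPU can still raise this IPI, return the slot. */
	lapic_ipi_free(vector);
	return (0);
}
#endif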