Index: head/libexec/rpc.rstatd/rstat_proc.c =================================================================== --- head/libexec/rpc.rstatd/rstat_proc.c (revision 317060) +++ head/libexec/rpc.rstatd/rstat_proc.c (revision 317061) @@ -1,475 +1,477 @@ /* * Sun RPC is a product of Sun Microsystems, Inc. and is provided for * unrestricted use provided that this legend is included on all tape * media and as a part of the software program in whole or part. Users * may copy or modify Sun RPC without charge, but are not authorized * to license or distribute it to anyone else except as part of a product or * program developed by the user. * * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE * WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. * * Sun RPC is provided with no support and without any obligation on the * part of Sun Microsystems, Inc. to assist in its use, correction, * modification or enhancement. * * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC * OR ANY PART THEREOF. * * In no event will Sun Microsystems, Inc. be liable for any lost revenue * or profits or other special, indirect and consequential damages, even if * Sun has been advised of the possibility of such damages. * * Sun Microsystems, Inc. * 2550 Garcia Avenue * Mountain View, California 94043 */ #ifndef lint #if 0 static char sccsid[] = "from: @(#)rpc.rstatd.c 1.1 86/09/25 Copyr 1984 Sun Micro"; static char sccsid[] = "from: @(#)rstat_proc.c 2.2 88/08/01 4.0 RPCSRC"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* * rstat service: built with rstat.x and derived from rpc.rstatd.c * * Copyright (c) 1984 by Sun Microsystems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef FSHIFT /* Use protocol's shift and scale values */ #undef FSCALE #undef if_ipackets #undef if_ierrors #undef if_opackets #undef if_oerrors #undef if_collisions #include int haveadisk(void); void updatexfers(int, int *); int stats_service(void); extern int from_inetd; int sincelastreq = 0; /* number of alarms since last request */ extern int closedown; union { struct stats s1; struct statsswtch s2; struct statstime s3; } stats_all; void updatestat(); static int stat_is_init = 0; static int cp_time_xlat[RSTAT_CPUSTATES] = { CP_USER, CP_NICE, CP_SYS, CP_IDLE }; static long bsd_cp_time[CPUSTATES]; #ifndef FSCALE #define FSCALE (1 << 8) #endif void stat_init(void) { stat_is_init = 1; alarm(0); updatestat(); (void) signal(SIGALRM, updatestat); alarm(1); } statstime * rstatproc_stats_3_svc(void *argp, struct svc_req *rqstp) { if (! stat_is_init) stat_init(); sincelastreq = 0; return(&stats_all.s3); } statsswtch * rstatproc_stats_2_svc(void *argp, struct svc_req *rqstp) { if (! stat_is_init) stat_init(); sincelastreq = 0; return(&stats_all.s2); } stats * rstatproc_stats_1_svc(void *argp, struct svc_req *rqstp) { if (! stat_is_init) stat_init(); sincelastreq = 0; return(&stats_all.s1); } u_int * rstatproc_havedisk_3_svc(void *argp, struct svc_req *rqstp) { static u_int have; if (! stat_is_init) stat_init(); sincelastreq = 0; have = haveadisk(); return(&have); } u_int * rstatproc_havedisk_2_svc(void *argp, struct svc_req *rqstp) { return(rstatproc_havedisk_3_svc(argp, rqstp)); } u_int * rstatproc_havedisk_1_svc(void *argp, struct svc_req *rqstp) { return(rstatproc_havedisk_3_svc(argp, rqstp)); } void updatestat(void) { int i, hz; struct clockinfo clockrate; struct ifmibdata ifmd; double avrun[3]; struct timeval tm, btm; int mib[6]; size_t len; + uint64_t val; int ifcount; #ifdef DEBUG fprintf(stderr, "entering updatestat\n"); #endif if (sincelastreq >= closedown) { #ifdef DEBUG fprintf(stderr, "about to closedown\n"); #endif if (from_inetd) exit(0); else { stat_is_init = 0; return; } } sincelastreq++; mib[0] = CTL_KERN; mib[1] = KERN_CLOCKRATE; len = sizeof clockrate; if (sysctl(mib, 2, &clockrate, &len, 0, 0) < 0) { syslog(LOG_ERR, "sysctl(kern.clockrate): %m"); exit(1); } hz = clockrate.hz; len = sizeof(bsd_cp_time); if (sysctlbyname("kern.cp_time", bsd_cp_time, &len, 0, 0) < 0) { syslog(LOG_ERR, "sysctl(kern.cp_time): %m"); exit(1); } for(i = 0; i < RSTAT_CPUSTATES ; i++) stats_all.s1.cp_time[i] = bsd_cp_time[cp_time_xlat[i]]; (void)getloadavg(avrun, sizeof(avrun) / sizeof(avrun[0])); stats_all.s2.avenrun[0] = avrun[0] * FSCALE; stats_all.s2.avenrun[1] = avrun[1] * FSCALE; stats_all.s2.avenrun[2] = avrun[2] * FSCALE; mib[0] = CTL_KERN; mib[1] = KERN_BOOTTIME; len = sizeof btm; if (sysctl(mib, 2, &btm, &len, 0, 0) < 0) { syslog(LOG_ERR, "sysctl(kern.boottime): %m"); exit(1); } stats_all.s2.boottime.tv_sec = btm.tv_sec; stats_all.s2.boottime.tv_usec = btm.tv_usec; #ifdef DEBUG fprintf(stderr, "%d %d %d %d\n", stats_all.s1.cp_time[0], stats_all.s1.cp_time[1], stats_all.s1.cp_time[2], stats_all.s1.cp_time[3]); #endif #define FETCH_CNT(stat, cnt) do { \ - len = sizeof((stat)); \ - if (sysctlbyname("vm.stats." #cnt , &(stat), &len, 0, 0) < 0) { \ - syslog(LOG_ERR, "sysctl(vm.stats." #cnt "): %m"); \ + len = sizeof(uint64_t); \ + if (sysctlbyname("vm.stats." #cnt , &val, &len, NULL, 0) < 0) { \ + syslog(LOG_ERR, "sysctl(vm.stats." #cnt "): %m"); \ exit(1); \ } \ + stat = val; \ } while (0) FETCH_CNT(stats_all.s1.v_pgpgin, vm.v_vnodepgsin); FETCH_CNT(stats_all.s1.v_pgpgout, vm.v_vnodepgsout); FETCH_CNT(stats_all.s1.v_pswpin, vm.v_swappgsin); FETCH_CNT(stats_all.s1.v_pswpout, vm.v_swappgsout); FETCH_CNT(stats_all.s1.v_intr, sys.v_intr); FETCH_CNT(stats_all.s2.v_swtch, sys.v_swtch); (void)gettimeofday(&tm, NULL); stats_all.s1.v_intr -= hz*(tm.tv_sec - btm.tv_sec) + hz*(tm.tv_usec - btm.tv_usec)/1000000; /* update disk transfers */ updatexfers(RSTAT_DK_NDRIVE, stats_all.s1.dk_xfer); mib[0] = CTL_NET; mib[1] = PF_LINK; mib[2] = NETLINK_GENERIC; mib[3] = IFMIB_SYSTEM; mib[4] = IFMIB_IFCOUNT; len = sizeof ifcount; if (sysctl(mib, 5, &ifcount, &len, 0, 0) < 0) { syslog(LOG_ERR, "sysctl(net.link.generic.system.ifcount): %m"); exit(1); } stats_all.s1.if_ipackets = 0; stats_all.s1.if_opackets = 0; stats_all.s1.if_ierrors = 0; stats_all.s1.if_oerrors = 0; stats_all.s1.if_collisions = 0; for (i = 1; i <= ifcount; i++) { len = sizeof ifmd; mib[3] = IFMIB_IFDATA; mib[4] = i; mib[5] = IFDATA_GENERAL; if (sysctl(mib, 6, &ifmd, &len, 0, 0) < 0) { if (errno == ENOENT) continue; syslog(LOG_ERR, "sysctl(net.link.ifdata.%d.general)" ": %m", i); exit(1); } stats_all.s1.if_ipackets += ifmd.ifmd_data.ifi_ipackets; stats_all.s1.if_opackets += ifmd.ifmd_data.ifi_opackets; stats_all.s1.if_ierrors += ifmd.ifmd_data.ifi_ierrors; stats_all.s1.if_oerrors += ifmd.ifmd_data.ifi_oerrors; stats_all.s1.if_collisions += ifmd.ifmd_data.ifi_collisions; } (void)gettimeofday(&tm, NULL); stats_all.s3.curtime.tv_sec = tm.tv_sec; stats_all.s3.curtime.tv_usec = tm.tv_usec; alarm(1); } /* * returns true if have a disk */ int haveadisk(void) { register int i; struct statinfo stats; int num_devices, retval = 0; if ((num_devices = devstat_getnumdevs(NULL)) < 0) { syslog(LOG_ERR, "rstatd: can't get number of devices: %s", devstat_errbuf); exit(1); } if (devstat_checkversion(NULL) < 0) { syslog(LOG_ERR, "rstatd: %s", devstat_errbuf); exit(1); } stats.dinfo = (struct devinfo *)malloc(sizeof(struct devinfo)); bzero(stats.dinfo, sizeof(struct devinfo)); if (devstat_getdevs(NULL, &stats) == -1) { syslog(LOG_ERR, "rstatd: can't get device list: %s", devstat_errbuf); exit(1); } for (i = 0; i < stats.dinfo->numdevs; i++) { if (((stats.dinfo->devices[i].device_type & DEVSTAT_TYPE_MASK) == DEVSTAT_TYPE_DIRECT) && ((stats.dinfo->devices[i].device_type & DEVSTAT_TYPE_PASS) == 0)) { retval = 1; break; } } if (stats.dinfo->mem_ptr) free(stats.dinfo->mem_ptr); free(stats.dinfo); return(retval); } void updatexfers(int numdevs, int *devs) { register int i, j, k, t; struct statinfo stats; int num_devices = 0; u_int64_t total_transfers; if ((num_devices = devstat_getnumdevs(NULL)) < 0) { syslog(LOG_ERR, "rstatd: can't get number of devices: %s", devstat_errbuf); exit(1); } if (devstat_checkversion(NULL) < 0) { syslog(LOG_ERR, "rstatd: %s", devstat_errbuf); exit(1); } stats.dinfo = (struct devinfo *)malloc(sizeof(struct devinfo)); bzero(stats.dinfo, sizeof(struct devinfo)); if (devstat_getdevs(NULL, &stats) == -1) { syslog(LOG_ERR, "rstatd: can't get device list: %s", devstat_errbuf); exit(1); } for (i = 0, j = 0; i < stats.dinfo->numdevs && j < numdevs; i++) { if (((stats.dinfo->devices[i].device_type & DEVSTAT_TYPE_MASK) == DEVSTAT_TYPE_DIRECT) && ((stats.dinfo->devices[i].device_type & DEVSTAT_TYPE_PASS) == 0)) { total_transfers = 0; for (k = 0; k < DEVSTAT_N_TRANS_FLAGS; k++) total_transfers += stats.dinfo->devices[i].operations[k]; /* * XXX KDM If the total transfers for this device * are greater than the amount we can fit in a * signed integer, just set them to the maximum * amount we can fit in a signed integer. I have a * feeling that the rstat protocol assumes 32-bit * integers, so this could well break on a 64-bit * architecture like the Alpha. */ if (total_transfers > INT_MAX) t = INT_MAX; else t = total_transfers; devs[j] = t; j++; } } if (stats.dinfo->mem_ptr) free(stats.dinfo->mem_ptr); free(stats.dinfo); } void rstat_service(struct svc_req *rqstp, SVCXPRT *transp) { union { int fill; } argument; char *result; bool_t (*xdr_argument)(), (*xdr_result)(); char *(*local)(); switch (rqstp->rq_proc) { case NULLPROC: (void)svc_sendreply(transp, (xdrproc_t)xdr_void, NULL); goto leave; case RSTATPROC_STATS: xdr_argument = xdr_void; xdr_result = xdr_statstime; switch (rqstp->rq_vers) { case RSTATVERS_ORIG: local = (char *(*)()) rstatproc_stats_1_svc; break; case RSTATVERS_SWTCH: local = (char *(*)()) rstatproc_stats_2_svc; break; case RSTATVERS_TIME: local = (char *(*)()) rstatproc_stats_3_svc; break; default: svcerr_progvers(transp, RSTATVERS_ORIG, RSTATVERS_TIME); goto leave; /*NOTREACHED*/ } break; case RSTATPROC_HAVEDISK: xdr_argument = xdr_void; xdr_result = xdr_u_int; switch (rqstp->rq_vers) { case RSTATVERS_ORIG: local = (char *(*)()) rstatproc_havedisk_1_svc; break; case RSTATVERS_SWTCH: local = (char *(*)()) rstatproc_havedisk_2_svc; break; case RSTATVERS_TIME: local = (char *(*)()) rstatproc_havedisk_3_svc; break; default: svcerr_progvers(transp, RSTATVERS_ORIG, RSTATVERS_TIME); goto leave; /*NOTREACHED*/ } break; default: svcerr_noproc(transp); goto leave; } bzero((char *)&argument, sizeof(argument)); if (!svc_getargs(transp, (xdrproc_t)xdr_argument, &argument)) { svcerr_decode(transp); goto leave; } result = (*local)(&argument, rqstp); if (result != NULL && !svc_sendreply(transp, (xdrproc_t)xdr_result, result)) { svcerr_systemerr(transp); } if (!svc_freeargs(transp, (xdrproc_t)xdr_argument, &argument)) errx(1, "unable to free arguments"); leave: if (from_inetd) exit(0); } Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 317060) +++ head/sys/amd64/amd64/trap.c (revision 317061) @@ -1,938 +1,938 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef KDTRACE_HOOKS #include #endif extern void __noinline trap(struct trapframe *frame); extern void trap_check(struct trapframe *frame); extern void syscall(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; ksiginfo_t ksi; - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif frame->tf_rflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Emulator can take care about this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) goto userout; addr = frame->tf_addr; i = trap_pfault(frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); goto userout; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); goto out; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); goto out; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); goto out; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; goto out; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; goto out; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap()) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); goto out; #endif /* DEV_ISA */ } trap_fatal(frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n"); printf("rip = 0x%lx\n", frame->tf_rip); printf("rsp = 0x%lx\n", frame->tf_rsp); printf("rbp = 0x%lx\n", frame->tf_rbp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; register_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = 6; params = (caddr_t)frame->tf_rsp + sizeof(register_t); sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame->tf_rdi; argp += reg; bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { struct syscall_args sa; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif error = syscallenter(td, &sa); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); KASSERT(td->td_md.md_invl_gen.gen == 0, ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen)); syscallret(td, error, &sa); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } Index: head/sys/amd64/include/atomic.h =================================================================== --- head/sys/amd64/include/atomic.h (revision 317060) +++ head/sys/amd64/include/atomic.h (revision 317061) @@ -1,678 +1,678 @@ /*- * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ATOMIC_H_ #define _MACHINE_ATOMIC_H_ #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif /* * To express interprocessor (as opposed to processor and device) memory * ordering constraints, use the atomic_*() functions with acquire and release * semantics rather than the *mb() functions. An architecture's memory * ordering (or memory consistency) model governs the order in which a * program's accesses to different locations may be performed by an * implementation of that architecture. In general, for memory regions * defined as writeback cacheable, the memory ordering implemented by amd64 * processors preserves the program ordering of a load followed by a load, a * load followed by a store, and a store followed by a store. Only a store * followed by a load to a different memory location may be reordered. * Therefore, except for special cases, like non-temporal memory accesses or * memory regions defined as write combining, the memory ordering effects * provided by the sfence instruction in the wmb() function and the lfence * instruction in the rmb() function are redundant. In contrast, the * atomic_*() functions with acquire and release semantics do not perform * redundant instructions for ordinary cases of interprocessor memory * ordering on any architecture. */ #define mb() __asm __volatile("mfence;" : : : "memory") #define wmb() __asm __volatile("sfence;" : : : "memory") #define rmb() __asm __volatile("lfence;" : : : "memory") /* * Various simple operations on memory, each of which is atomic in the * presence of interrupts and multiple processors. * * atomic_set_char(P, V) (*(u_char *)(P) |= (V)) * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V)) * atomic_add_char(P, V) (*(u_char *)(P) += (V)) * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V)) * * atomic_set_short(P, V) (*(u_short *)(P) |= (V)) * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V)) * atomic_add_short(P, V) (*(u_short *)(P) += (V)) * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V)) * * atomic_set_int(P, V) (*(u_int *)(P) |= (V)) * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V)) * atomic_add_int(P, V) (*(u_int *)(P) += (V)) * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V)) * atomic_swap_int(P, V) (return (*(u_int *)(P)); *(u_int *)(P) = (V);) * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;) * * atomic_set_long(P, V) (*(u_long *)(P) |= (V)) * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V)) * atomic_add_long(P, V) (*(u_long *)(P) += (V)) * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V)) * atomic_swap_long(P, V) (return (*(u_long *)(P)); *(u_long *)(P) = (V);) * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;) */ /* * The above functions are expanded inline in the statically-linked * kernel. Lock prefixes are generated if an SMP kernel is being * built. * * Kernel modules call real functions which are built into the kernel. * This allows kernel modules to be portable between UP and SMP systems. */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_char(volatile u_char *dst, u_char expect, u_char src); int atomic_cmpset_short(volatile u_short *dst, u_short expect, u_short src); int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src); int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src); int atomic_fcmpset_char(volatile u_char *dst, u_char *expect, u_char src); int atomic_fcmpset_short(volatile u_short *dst, u_short *expect, u_short src); int atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src); int atomic_fcmpset_long(volatile u_long *dst, u_long *expect, u_long src); u_int atomic_fetchadd_int(volatile u_int *p, u_int v); u_long atomic_fetchadd_long(volatile u_long *p, u_long v); int atomic_testandset_int(volatile u_int *p, u_int v); int atomic_testandset_long(volatile u_long *p, u_int v); int atomic_testandclear_int(volatile u_int *p, u_int v); int atomic_testandclear_long(volatile u_long *p, u_int v); void atomic_thread_fence_acq(void); void atomic_thread_fence_acq_rel(void); void atomic_thread_fence_rel(void); void atomic_thread_fence_seq_cst(void); #define ATOMIC_LOAD(TYPE) \ u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p) #define ATOMIC_STORE(TYPE) \ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) #else /* !KLD_MODULE && __GNUCLIKE_ASM */ /* * For userland, always use lock prefixes so that the binaries will run * on both SMP and !SMP systems. */ #if defined(SMP) || !defined(_KERNEL) #define MPLOCKED "lock ; " #else #define MPLOCKED #endif /* * The assembly is volatilized to avoid code chunk removal by the compiler. * GCC aggressively reorders operations and memory clobbering is necessary * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "cc"); \ } \ \ static __inline void \ atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "memory", "cc"); \ } \ struct __hack /* * Atomic compare and set, used by the mutex functions. * * cmpset: * if (*dst == expect) * *dst = src * * fcmpset: * if (*dst == *expect) * *dst = src * else * *expect = *dst * * Returns 0 on failure, non-zero on success. */ #define ATOMIC_CMPSET(TYPE) \ static __inline int \ atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_cmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (expect) /* 2 */ \ : "r" (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } \ \ static __inline int \ atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_fcmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (*expect) /* 2 */ \ : "r" (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } ATOMIC_CMPSET(char); ATOMIC_CMPSET(short); ATOMIC_CMPSET(int); ATOMIC_CMPSET(long); /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. */ static __inline u_int atomic_fetchadd_int(volatile u_int *p, u_int v) { __asm __volatile( " " MPLOCKED " " " xaddl %0,%1 ; " "# atomic_fetchadd_int" : "+r" (v), /* 0 */ "+m" (*p) /* 1 */ : : "cc"); return (v); } /* * Atomically add the value of v to the long integer pointed to by p and return * the previous value of *p. */ static __inline u_long atomic_fetchadd_long(volatile u_long *p, u_long v) { __asm __volatile( " " MPLOCKED " " " xaddq %0,%1 ; " "# atomic_fetchadd_long" : "+r" (v), /* 0 */ "+m" (*p) /* 1 */ : : "cc"); return (v); } static __inline int atomic_testandset_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btsl %2,%1 ; " " setc %0 ; " "# atomic_testandset_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandset_long(volatile u_long *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btsq %2,%1 ; " " setc %0 ; " "# atomic_testandset_long" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Jr" ((u_long)(v & 0x3f)) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandclear_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btrl %2,%1 ; " " setc %0 ; " "# atomic_testandclear_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandclear_long(volatile u_long *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btrq %2,%1 ; " " setc %0 ; " "# atomic_testandclear_long" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Jr" ((u_long)(v & 0x3f)) /* 2 */ : "cc"); return (res); } /* * We assume that a = b will do atomic loads and stores. Due to the * IA32 memory model, a simple store guarantees release semantics. * * However, a load may pass a store if they are performed on distinct * addresses, so we need a Store/Load barrier for sequentially * consistent fences in SMP kernels. We use "lock addl $0,mem" for a * Store/Load barrier, as recommended by the AMD Software Optimization * Guide, and not mfence. To avoid false data dependencies, we use a * special address for "mem". In the kernel, we use a private per-cpu * cache line. In user space, we use a word in the stack's red zone * (-8(%rsp)). * * For UP kernels, however, the memory of the single processor is * always consistent, so we only need to stop the compiler from * reordering accesses in a way that violates the semantics of acquire * and release. */ #if defined(_KERNEL) /* * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf). * * The open-coded number is used instead of the symbolic expression to * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers. * An assertion in amd64/vm_machdep.c ensures that the value is correct. */ -#define OFFSETOF_MONITORBUF 0x180 +#define OFFSETOF_MONITORBUF 0x100 #if defined(SMP) static __inline void __storeload_barrier(void) { __asm __volatile("lock; addl $0,%%gs:%0" : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc"); } #else /* _KERNEL && UP */ static __inline void __storeload_barrier(void) { __compiler_membar(); } #endif /* SMP */ #else /* !_KERNEL */ static __inline void __storeload_barrier(void) { __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc"); } #endif /* _KERNEL*/ #define ATOMIC_LOAD(TYPE) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ u_##TYPE res; \ \ res = *p; \ __compiler_membar(); \ return (res); \ } \ struct __hack #define ATOMIC_STORE(TYPE) \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \ { \ \ __compiler_membar(); \ *p = v; \ } \ struct __hack static __inline void atomic_thread_fence_acq(void) { __compiler_membar(); } static __inline void atomic_thread_fence_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_acq_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_seq_cst(void) { __storeload_barrier(); } #endif /* KLD_MODULE || !__GNUCLIKE_ASM */ ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v); ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v); ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v); ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v); ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v); ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v); ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v); ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v); ATOMIC_ASM(set, int, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, int, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v); ATOMIC_ASM(set, long, "orq %1,%0", "ir", v); ATOMIC_ASM(clear, long, "andq %1,%0", "ir", ~v); ATOMIC_ASM(add, long, "addq %1,%0", "ir", v); ATOMIC_ASM(subtract, long, "subq %1,%0", "ir", v); #define ATOMIC_LOADSTORE(TYPE) \ ATOMIC_LOAD(TYPE); \ ATOMIC_STORE(TYPE) ATOMIC_LOADSTORE(char); ATOMIC_LOADSTORE(short); ATOMIC_LOADSTORE(int); ATOMIC_LOADSTORE(long); #undef ATOMIC_ASM #undef ATOMIC_LOAD #undef ATOMIC_STORE #undef ATOMIC_LOADSTORE #ifndef WANT_FUNCTIONS /* Read the current value and store a new value in the destination. */ #ifdef __GNUCLIKE_ASM static __inline u_int atomic_swap_int(volatile u_int *p, u_int v) { __asm __volatile( " xchgl %1,%0 ; " "# atomic_swap_int" : "+r" (v), /* 0 */ "+m" (*p)); /* 1 */ return (v); } static __inline u_long atomic_swap_long(volatile u_long *p, u_long v) { __asm __volatile( " xchgq %1,%0 ; " "# atomic_swap_long" : "+r" (v), /* 0 */ "+m" (*p)); /* 1 */ return (v); } #else /* !__GNUCLIKE_ASM */ u_int atomic_swap_int(volatile u_int *p, u_int v); u_long atomic_swap_long(volatile u_long *p, u_long v); #endif /* __GNUCLIKE_ASM */ #define atomic_set_acq_char atomic_set_barr_char #define atomic_set_rel_char atomic_set_barr_char #define atomic_clear_acq_char atomic_clear_barr_char #define atomic_clear_rel_char atomic_clear_barr_char #define atomic_add_acq_char atomic_add_barr_char #define atomic_add_rel_char atomic_add_barr_char #define atomic_subtract_acq_char atomic_subtract_barr_char #define atomic_subtract_rel_char atomic_subtract_barr_char #define atomic_cmpset_acq_char atomic_cmpset_char #define atomic_cmpset_rel_char atomic_cmpset_char #define atomic_fcmpset_acq_char atomic_fcmpset_char #define atomic_fcmpset_rel_char atomic_fcmpset_char #define atomic_set_acq_short atomic_set_barr_short #define atomic_set_rel_short atomic_set_barr_short #define atomic_clear_acq_short atomic_clear_barr_short #define atomic_clear_rel_short atomic_clear_barr_short #define atomic_add_acq_short atomic_add_barr_short #define atomic_add_rel_short atomic_add_barr_short #define atomic_subtract_acq_short atomic_subtract_barr_short #define atomic_subtract_rel_short atomic_subtract_barr_short #define atomic_cmpset_acq_short atomic_cmpset_short #define atomic_cmpset_rel_short atomic_cmpset_short #define atomic_fcmpset_acq_short atomic_fcmpset_short #define atomic_fcmpset_rel_short atomic_fcmpset_short #define atomic_set_acq_int atomic_set_barr_int #define atomic_set_rel_int atomic_set_barr_int #define atomic_clear_acq_int atomic_clear_barr_int #define atomic_clear_rel_int atomic_clear_barr_int #define atomic_add_acq_int atomic_add_barr_int #define atomic_add_rel_int atomic_add_barr_int #define atomic_subtract_acq_int atomic_subtract_barr_int #define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int #define atomic_fcmpset_acq_int atomic_fcmpset_int #define atomic_fcmpset_rel_int atomic_fcmpset_int #define atomic_set_acq_long atomic_set_barr_long #define atomic_set_rel_long atomic_set_barr_long #define atomic_clear_acq_long atomic_clear_barr_long #define atomic_clear_rel_long atomic_clear_barr_long #define atomic_add_acq_long atomic_add_barr_long #define atomic_add_rel_long atomic_add_barr_long #define atomic_subtract_acq_long atomic_subtract_barr_long #define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long #define atomic_fcmpset_acq_long atomic_fcmpset_long #define atomic_fcmpset_rel_long atomic_fcmpset_long #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) /* Operations on 8-bit bytes. */ #define atomic_set_8 atomic_set_char #define atomic_set_acq_8 atomic_set_acq_char #define atomic_set_rel_8 atomic_set_rel_char #define atomic_clear_8 atomic_clear_char #define atomic_clear_acq_8 atomic_clear_acq_char #define atomic_clear_rel_8 atomic_clear_rel_char #define atomic_add_8 atomic_add_char #define atomic_add_acq_8 atomic_add_acq_char #define atomic_add_rel_8 atomic_add_rel_char #define atomic_subtract_8 atomic_subtract_char #define atomic_subtract_acq_8 atomic_subtract_acq_char #define atomic_subtract_rel_8 atomic_subtract_rel_char #define atomic_load_acq_8 atomic_load_acq_char #define atomic_store_rel_8 atomic_store_rel_char #define atomic_cmpset_8 atomic_cmpset_char #define atomic_cmpset_acq_8 atomic_cmpset_acq_char #define atomic_cmpset_rel_8 atomic_cmpset_rel_char #define atomic_fcmpset_8 atomic_fcmpset_char #define atomic_fcmpset_acq_8 atomic_fcmpset_acq_char #define atomic_fcmpset_rel_8 atomic_fcmpset_rel_char /* Operations on 16-bit words. */ #define atomic_set_16 atomic_set_short #define atomic_set_acq_16 atomic_set_acq_short #define atomic_set_rel_16 atomic_set_rel_short #define atomic_clear_16 atomic_clear_short #define atomic_clear_acq_16 atomic_clear_acq_short #define atomic_clear_rel_16 atomic_clear_rel_short #define atomic_add_16 atomic_add_short #define atomic_add_acq_16 atomic_add_acq_short #define atomic_add_rel_16 atomic_add_rel_short #define atomic_subtract_16 atomic_subtract_short #define atomic_subtract_acq_16 atomic_subtract_acq_short #define atomic_subtract_rel_16 atomic_subtract_rel_short #define atomic_load_acq_16 atomic_load_acq_short #define atomic_store_rel_16 atomic_store_rel_short #define atomic_cmpset_16 atomic_cmpset_short #define atomic_cmpset_acq_16 atomic_cmpset_acq_short #define atomic_cmpset_rel_16 atomic_cmpset_rel_short #define atomic_fcmpset_16 atomic_fcmpset_short #define atomic_fcmpset_acq_16 atomic_fcmpset_acq_short #define atomic_fcmpset_rel_16 atomic_fcmpset_rel_short /* Operations on 32-bit double words. */ #define atomic_set_32 atomic_set_int #define atomic_set_acq_32 atomic_set_acq_int #define atomic_set_rel_32 atomic_set_rel_int #define atomic_clear_32 atomic_clear_int #define atomic_clear_acq_32 atomic_clear_acq_int #define atomic_clear_rel_32 atomic_clear_rel_int #define atomic_add_32 atomic_add_int #define atomic_add_acq_32 atomic_add_acq_int #define atomic_add_rel_32 atomic_add_rel_int #define atomic_subtract_32 atomic_subtract_int #define atomic_subtract_acq_32 atomic_subtract_acq_int #define atomic_subtract_rel_32 atomic_subtract_rel_int #define atomic_load_acq_32 atomic_load_acq_int #define atomic_store_rel_32 atomic_store_rel_int #define atomic_cmpset_32 atomic_cmpset_int #define atomic_cmpset_acq_32 atomic_cmpset_acq_int #define atomic_cmpset_rel_32 atomic_cmpset_rel_int #define atomic_fcmpset_32 atomic_fcmpset_int #define atomic_fcmpset_acq_32 atomic_fcmpset_acq_int #define atomic_fcmpset_rel_32 atomic_fcmpset_rel_int #define atomic_swap_32 atomic_swap_int #define atomic_readandclear_32 atomic_readandclear_int #define atomic_fetchadd_32 atomic_fetchadd_int #define atomic_testandset_32 atomic_testandset_int #define atomic_testandclear_32 atomic_testandclear_int /* Operations on 64-bit quad words. */ #define atomic_set_64 atomic_set_long #define atomic_set_acq_64 atomic_set_acq_long #define atomic_set_rel_64 atomic_set_rel_long #define atomic_clear_64 atomic_clear_long #define atomic_clear_acq_64 atomic_clear_acq_long #define atomic_clear_rel_64 atomic_clear_rel_long #define atomic_add_64 atomic_add_long #define atomic_add_acq_64 atomic_add_acq_long #define atomic_add_rel_64 atomic_add_rel_long #define atomic_subtract_64 atomic_subtract_long #define atomic_subtract_acq_64 atomic_subtract_acq_long #define atomic_subtract_rel_64 atomic_subtract_rel_long #define atomic_load_acq_64 atomic_load_acq_long #define atomic_store_rel_64 atomic_store_rel_long #define atomic_cmpset_64 atomic_cmpset_long #define atomic_cmpset_acq_64 atomic_cmpset_acq_long #define atomic_cmpset_rel_64 atomic_cmpset_rel_long #define atomic_fcmpset_64 atomic_fcmpset_long #define atomic_fcmpset_acq_64 atomic_fcmpset_acq_long #define atomic_fcmpset_rel_64 atomic_fcmpset_rel_long #define atomic_swap_64 atomic_swap_long #define atomic_readandclear_64 atomic_readandclear_long #define atomic_fetchadd_64 atomic_fetchadd_long #define atomic_testandset_64 atomic_testandset_long #define atomic_testandclear_64 atomic_testandclear_long /* Operations on pointers. */ #define atomic_set_ptr atomic_set_long #define atomic_set_acq_ptr atomic_set_acq_long #define atomic_set_rel_ptr atomic_set_rel_long #define atomic_clear_ptr atomic_clear_long #define atomic_clear_acq_ptr atomic_clear_acq_long #define atomic_clear_rel_ptr atomic_clear_rel_long #define atomic_add_ptr atomic_add_long #define atomic_add_acq_ptr atomic_add_acq_long #define atomic_add_rel_ptr atomic_add_rel_long #define atomic_subtract_ptr atomic_subtract_long #define atomic_subtract_acq_ptr atomic_subtract_acq_long #define atomic_subtract_rel_ptr atomic_subtract_rel_long #define atomic_load_acq_ptr atomic_load_acq_long #define atomic_store_rel_ptr atomic_store_rel_long #define atomic_cmpset_ptr atomic_cmpset_long #define atomic_cmpset_acq_ptr atomic_cmpset_acq_long #define atomic_cmpset_rel_ptr atomic_cmpset_rel_long #define atomic_fcmpset_ptr atomic_fcmpset_long #define atomic_fcmpset_acq_ptr atomic_fcmpset_acq_long #define atomic_fcmpset_rel_ptr atomic_fcmpset_rel_long #define atomic_swap_ptr atomic_swap_long #define atomic_readandclear_ptr atomic_readandclear_long #endif /* !WANT_FUNCTIONS */ #endif /* !_MACHINE_ATOMIC_H_ */ Index: head/sys/amd64/include/counter.h =================================================================== --- head/sys/amd64/include/counter.h (revision 317060) +++ head/sys/amd64/include/counter.h (revision 317061) @@ -1,89 +1,91 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include -extern struct pcpu __pcpu[1]; +extern struct pcpu __pcpu[]; + +#define EARLY_COUNTER &__pcpu[0].pc_early_dummy_counter #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, i) counter_u64_add(c, i) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { __asm __volatile("addq\t%1,%%gs:(%0)" : : "r" ((char *)c - (char *)&__pcpu[0]), "ri" (inc) : "memory", "cc"); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/amd64/include/pcpu.h =================================================================== --- head/sys/amd64/include/pcpu.h (revision 317060) +++ head/sys/amd64/include/pcpu.h (revision 317061) @@ -1,262 +1,262 @@ /*- * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. * The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ /* Pointer to the CPU %fs descriptor */ \ struct user_segment_descriptor *pc_fs32p; \ /* Pointer to the CPU %gs descriptor */ \ struct user_segment_descriptor *pc_gs32p; \ /* Pointer to the CPU LDT descriptor */ \ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ struct system_segment_descriptor *pc_tss; \ uint64_t pc_pm_save_cnt; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[145] /* be divisor of PAGE_SIZE \ + char __pad[384] /* be divisor of PAGE_SIZE \ after cache alignment */ #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 #ifdef _KERNEL #ifdef lint extern struct pcpu *pcpup; #define get_pcpu() (pcpup) #define PCPU_GET(member) (pcpup->pc_ ## member) #define PCPU_ADD(member, val) (pcpup->pc_ ## member += (val)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&pcpup->pc_ ## member) #define PCPU_SET(member, val) (pcpup->pc_ ## member = (val)) #elif defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. */ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movq %%gs:%1,%0; addq %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4 || sizeof(__res) == 8) { \ __asm __volatile("mov %%gs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds the value to the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4 || \ sizeof(__pcpu_type(name)) == 8); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 8) { \ __asm __volatile("incq %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. */ #define __PCPU_SET(name, val) { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movq %%gs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define OFFSETOF_CURTHREAD 0 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnull-dereference" #endif static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("movq %%gs:%1,%0" : "=r" (td) : "m" (*(char *)OFFSETOF_CURTHREAD)); return (td); } #ifdef __clang__ #pragma clang diagnostic pop #endif #define curthread (__curthread()) #define OFFSETOF_CURPCB 32 static __inline __pure2 struct pcb * __curpcb(void) { struct pcb *pcb; __asm("movq %%gs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB)); return (pcb); } #define curpcb (__curpcb()) #define IS_BSP() (PCPU_GET(cpuid) == 0) #else /* !lint || defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */ #error "this file needs to be ported to your compiler" #endif /* lint, etc. */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/arm/arm/intr.c =================================================================== --- head/sys/arm/arm/intr.c (revision 317060) +++ head/sys/arm/arm/intr.c (revision 317061) @@ -1,202 +1,202 @@ /* $NetBSD: intr.c,v 1.12 2003/07/15 00:24:41 lukem Exp $ */ /*- * Copyright (c) 2004 Olivier Houchard. * Copyright (c) 1994-1998 Mark Brinicombe. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe * for the NetBSD Project. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Soft interrupt and other generic interrupt functions. */ #include "opt_platform.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #endif #define INTRNAME_LEN (MAXCOMLEN + 1) typedef void (*mask_fn)(void *); static struct intr_event *intr_events[NIRQ]; void intr_irq_handler(struct trapframe *); void (*arm_post_filter)(void *) = NULL; int (*arm_config_irq)(int irq, enum intr_trigger trig, enum intr_polarity pol) = NULL; /* Data for statistics reporting. */ u_long intrcnt[NIRQ]; char intrnames[(NIRQ * INTRNAME_LEN) + 1]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); /* * Pre-format intrnames into an array of fixed-size strings containing spaces. * This allows us to avoid the need for an intermediate table of indices into * the names and counts arrays, while still meeting the requirements and * assumptions of vmstat(8) and the kdb "show intrcnt" command, the two * consumers of this data. */ static void intr_init(void *unused) { int i; for (i = 0; i < NIRQ; ++i) { snprintf(&intrnames[i * INTRNAME_LEN], INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, ""); } } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); #ifdef FDT int intr_fdt_map_irq(phandle_t iparent, pcell_t *intr, int icells) { fdt_pic_decode_t intr_decode; phandle_t intr_parent; int i, rv, interrupt, trig, pol; intr_parent = OF_node_from_xref(iparent); for (i = 0; i < icells; i++) intr[i] = cpu_to_fdt32(intr[i]); for (i = 0; fdt_pic_table[i] != NULL; i++) { intr_decode = fdt_pic_table[i]; rv = intr_decode(intr_parent, intr, &interrupt, &trig, &pol); if (rv == 0) { /* This was recognized as our PIC and decoded. */ interrupt = FDT_MAP_IRQ(intr_parent, interrupt); return (interrupt); } } /* Not in table, so guess */ interrupt = FDT_MAP_IRQ(intr_parent, fdt32_to_cpu(intr[0])); return (interrupt); } #endif void arm_setup_irqhandler(const char *name, driver_filter_t *filt, void (*hand)(void*), void *arg, int irq, int flags, void **cookiep) { struct intr_event *event; int error; if (irq < 0 || irq >= NIRQ) return; event = intr_events[irq]; if (event == NULL) { error = intr_event_create(&event, (void *)irq, 0, irq, (mask_fn)arm_mask_irq, (mask_fn)arm_unmask_irq, arm_post_filter, NULL, "intr%d:", irq); if (error) return; intr_events[irq] = event; snprintf(&intrnames[irq * INTRNAME_LEN], INTRNAME_LEN, "irq%d: %-*s", irq, INTRNAME_LEN - 1, name); } intr_event_add_handler(event, name, filt, hand, arg, intr_priority(flags), flags, cookiep); } int arm_remove_irqhandler(int irq, void *cookie) { struct intr_event *event; int error; event = intr_events[irq]; arm_mask_irq(irq); error = intr_event_remove_handler(cookie); if (!TAILQ_EMPTY(&event->ie_handlers)) arm_unmask_irq(irq); return (error); } void dosoftints(void); void dosoftints(void) { } void intr_irq_handler(struct trapframe *frame) { struct intr_event *event; int i; - PCPU_INC(cnt.v_intr); + VM_CNT_INC(v_intr); i = -1; while ((i = arm_get_next_irq(i)) != -1) { intrcnt[i]++; event = intr_events[i]; if (intr_event_handle(event, frame) != 0) { /* XXX: Log stray IRQs */ arm_mask_irq(i); } } #ifdef HWPMC_HOOKS if (pmc_hook && (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN)) pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, frame); #endif } Index: head/sys/arm/arm/trap-v4.c =================================================================== --- head/sys/arm/arm/trap-v4.c (revision 317060) +++ head/sys/arm/arm/trap-v4.c (revision 317061) @@ -1,729 +1,729 @@ /* $NetBSD: fault.c,v 1.45 2003/11/20 14:44:36 scw Exp $ */ /*- * Copyright 2004 Olivier Houchard * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1997 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * fault.c * * Fault handlers * * Created : 28/11/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #ifdef KDTRACE_HOOKS #include #endif #define ReadWord(a) (*((volatile unsigned int *)(a))) #ifdef DEBUG int last_fault_code; /* For the benefit of pmap_fault_fixup() */ #endif struct ksig { int signb; u_long code; }; struct data_abort { int (*func)(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); const char *desc; }; static int dab_fatal(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static int dab_align(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static int dab_buserr(struct trapframe *, u_int, u_int, struct thread *, struct ksig *); static void prefetch_abort_handler(struct trapframe *); static const struct data_abort data_aborts[] = { {dab_fatal, "Vector Exception"}, {dab_align, "Alignment Fault 1"}, {dab_fatal, "Terminal Exception"}, {dab_align, "Alignment Fault 3"}, {dab_buserr, "External Linefetch Abort (S)"}, {NULL, "Translation Fault (S)"}, {dab_buserr, "External Linefetch Abort (P)"}, {NULL, "Translation Fault (P)"}, {dab_buserr, "External Non-Linefetch Abort (S)"}, {NULL, "Domain Fault (S)"}, {dab_buserr, "External Non-Linefetch Abort (P)"}, {NULL, "Domain Fault (P)"}, {dab_buserr, "External Translation Abort (L1)"}, {NULL, "Permission Fault (S)"}, {dab_buserr, "External Translation Abort (L2)"}, {NULL, "Permission Fault (P)"} }; /* Determine if a fault came from user mode */ #define TRAP_USERMODE(tf) ((tf->tf_spsr & PSR_MODE) == PSR_USR32_MODE) /* Determine if 'x' is a permission fault */ #define IS_PERMISSION_FAULT(x) \ (((1 << ((x) & FAULT_TYPE_MASK)) & \ ((1 << FAULT_PERM_P) | (1 << FAULT_PERM_S))) != 0) static __inline void call_trapsignal(struct thread *td, int sig, u_long code) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int)code; trapsignal(td, &ksi); } void abort_handler(struct trapframe *tf, int type) { struct vm_map *map; struct pcb *pcb; struct thread *td; u_int user, far, fsr; vm_prot_t ftype; void *onfault; vm_offset_t va; int error = 0; struct ksig ksig; struct proc *p; if (type == 1) return (prefetch_abort_handler(tf)); /* Grab FAR/FSR before enabling interrupts */ far = cpu_faultaddress(); fsr = cpu_faultstatus(); #if 0 printf("data abort: fault address=%p (from pc=%p lr=%p)\n", (void*)far, (void*)tf->tf_pc, (void*)tf->tf_svc_lr); #endif /* Update vmmeter statistics */ #if 0 vmexp.traps++; #endif td = curthread; p = td->td_proc; - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); /* Data abort came from user mode? */ user = TRAP_USERMODE(tf); if (user) { td->td_pticks = 0; td->td_frame = tf; if (td->td_cowgen != td->td_proc->p_cowgen) thread_cow_update(td); } /* Grab the current pcb */ pcb = td->td_pcb; /* Re-enable interrupts if they were enabled previously */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } /* Invoke the appropriate handler, if necessary */ if (__predict_false(data_aborts[fsr & FAULT_TYPE_MASK].func != NULL)) { if ((data_aborts[fsr & FAULT_TYPE_MASK].func)(tf, fsr, far, td, &ksig)) { goto do_trapsignal; } goto out; } /* * At this point, we're dealing with one of the following data aborts: * * FAULT_TRANS_S - Translation -- Section * FAULT_TRANS_P - Translation -- Page * FAULT_DOMAIN_S - Domain -- Section * FAULT_DOMAIN_P - Domain -- Page * FAULT_PERM_S - Permission -- Section * FAULT_PERM_P - Permission -- Page * * These are the main virtual memory-related faults signalled by * the MMU. */ /* * Make sure the Program Counter is sane. We could fall foul of * someone executing Thumb code, in which case the PC might not * be word-aligned. This would cause a kernel alignment fault * further down if we have to decode the current instruction. * XXX: It would be nice to be able to support Thumb at some point. */ if (__predict_false((tf->tf_pc & 3) != 0)) { if (user) { /* * Give the user an illegal instruction signal. */ /* Deliver a SIGILL to the process */ ksig.signb = SIGILL; ksig.code = 0; goto do_trapsignal; } /* * The kernel never executes Thumb code. */ printf("\ndata_abort_fault: Misaligned Kernel-mode " "Program Counter\n"); dab_fatal(tf, fsr, far, td, &ksig); } va = trunc_page((vm_offset_t)far); /* * It is only a kernel address space fault iff: * 1. user == 0 and * 2. pcb_onfault not set or * 3. pcb_onfault set and not LDRT/LDRBT/STRT/STRBT instruction. */ if (user == 0 && (va >= VM_MIN_KERNEL_ADDRESS || (va < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW)) && __predict_true((pcb->pcb_onfault == NULL || (ReadWord(tf->tf_pc) & 0x05200000) != 0x04200000))) { map = kernel_map; /* Was the fault due to the FPE/IPKDB ? */ if (__predict_false((tf->tf_spsr & PSR_MODE)==PSR_UND32_MODE)) { /* * Force exit via userret() * This is necessary as the FPE is an extension to * userland that actually runs in a priveledged mode * but uses USR mode permissions for its accesses. */ user = 1; ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } } else { map = &td->td_proc->p_vmspace->vm_map; } /* * We need to know whether the page should be mapped as R or R/W. * On armv4, the fault status register does not indicate whether * the access was a read or write. We know that a permission fault * can only be the result of a write to a read-only location, so we * can deal with those quickly. Otherwise we need to disassemble * the faulting instruction to determine if it was a write. */ if (IS_PERMISSION_FAULT(fsr)) ftype = VM_PROT_WRITE; else { u_int insn = ReadWord(tf->tf_pc); if (((insn & 0x0c100000) == 0x04000000) || /* STR/STRB */ ((insn & 0x0e1000b0) == 0x000000b0) || /* STRH/STRD */ ((insn & 0x0a100000) == 0x08000000)) { /* STM/CDT */ ftype = VM_PROT_WRITE; } else { if ((insn & 0x0fb00ff0) == 0x01000090) /* SWP */ ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; } } /* * See if the fault is as a result of ref/mod emulation, * or domain mismatch. */ #ifdef DEBUG last_fault_code = fsr; #endif if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) goto fatal_pagefault; if (pmap_fault_fixup(vmspace_pmap(td->td_proc->p_vmspace), va, ftype, user)) { goto out; } onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); pcb->pcb_onfault = onfault; if (__predict_true(error == 0)) goto out; fatal_pagefault: if (user == 0) { if (pcb->pcb_onfault) { tf->tf_r0 = error; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return; } printf("\nvm_fault(%p, %x, %x, 0) -> %x\n", map, va, ftype, error); dab_fatal(tf, fsr, far, td, &ksig); } if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: /* If returning to user mode, make sure to invoke userret() */ if (user) userret(td, tf); } /* * dab_fatal() handles the following data aborts: * * FAULT_WRTBUF_0 - Vector Exception * FAULT_WRTBUF_1 - Terminal Exception * * We should never see these on a properly functioning system. * * This function is also called by the other handlers if they * detect a fatal problem. * * Note: If 'l' is NULL, we assume we're dealing with a prefetch abort. */ static int dab_fatal(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { const char *mode; #ifdef KDTRACE_HOOKS if (!TRAP_USERMODE(tf)) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(tf, far & FAULT_TYPE_MASK)) return (0); } #endif mode = TRAP_USERMODE(tf) ? "user" : "kernel"; disable_interrupts(PSR_I|PSR_F); if (td != NULL) { printf("Fatal %s mode data abort: '%s'\n", mode, data_aborts[fsr & FAULT_TYPE_MASK].desc); printf("trapframe: %p\nFSR=%08x, FAR=", tf, fsr); if ((fsr & FAULT_IMPRECISE) == 0) printf("%08x, ", far); else printf("Invalid, "); printf("spsr=%08x\n", tf->tf_spsr); } else { printf("Fatal %s mode prefetch abort at 0x%08x\n", mode, tf->tf_pc); printf("trapframe: %p, spsr=%08x\n", tf, tf->tf_spsr); } printf("r0 =%08x, r1 =%08x, r2 =%08x, r3 =%08x\n", tf->tf_r0, tf->tf_r1, tf->tf_r2, tf->tf_r3); printf("r4 =%08x, r5 =%08x, r6 =%08x, r7 =%08x\n", tf->tf_r4, tf->tf_r5, tf->tf_r6, tf->tf_r7); printf("r8 =%08x, r9 =%08x, r10=%08x, r11=%08x\n", tf->tf_r8, tf->tf_r9, tf->tf_r10, tf->tf_r11); printf("r12=%08x, ", tf->tf_r12); if (TRAP_USERMODE(tf)) printf("usp=%08x, ulr=%08x", tf->tf_usr_sp, tf->tf_usr_lr); else printf("ssp=%08x, slr=%08x", tf->tf_svc_sp, tf->tf_svc_lr); printf(", pc =%08x\n\n", tf->tf_pc); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(fsr, 0, tf)) return (0); #endif panic("Fatal abort"); /*NOTREACHED*/ } /* * dab_align() handles the following data aborts: * * FAULT_ALIGN_0 - Alignment fault * FAULT_ALIGN_1 - Alignment fault * * These faults are fatal if they happen in kernel mode. Otherwise, we * deliver a bus error to the process. */ static int dab_align(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { /* Alignment faults are always fatal if they occur in kernel mode */ if (!TRAP_USERMODE(tf)) { if (!td || !td->td_pcb->pcb_onfault) dab_fatal(tf, fsr, far, td, ksig); tf->tf_r0 = EFAULT; tf->tf_pc = (int)td->td_pcb->pcb_onfault; return (0); } /* pcb_onfault *must* be NULL at this point */ /* Deliver a bus error signal to the process */ ksig->code = 0; ksig->signb = SIGBUS; td->td_frame = tf; return (1); } /* * dab_buserr() handles the following data aborts: * * FAULT_BUSERR_0 - External Abort on Linefetch -- Section * FAULT_BUSERR_1 - External Abort on Linefetch -- Page * FAULT_BUSERR_2 - External Abort on Non-linefetch -- Section * FAULT_BUSERR_3 - External Abort on Non-linefetch -- Page * FAULT_BUSTRNL1 - External abort on Translation -- Level 1 * FAULT_BUSTRNL2 - External abort on Translation -- Level 2 * * If pcb_onfault is set, flag the fault and return to the handler. * If the fault occurred in user mode, give the process a SIGBUS. * * Note: On XScale, FAULT_BUSERR_0, FAULT_BUSERR_1, and FAULT_BUSERR_2 * can be flagged as imprecise in the FSR. This causes a real headache * since some of the machine state is lost. In this case, tf->tf_pc * may not actually point to the offending instruction. In fact, if * we've taken a double abort fault, it generally points somewhere near * the top of "data_abort_entry" in exception.S. * * In all other cases, these data aborts are considered fatal. */ static int dab_buserr(struct trapframe *tf, u_int fsr, u_int far, struct thread *td, struct ksig *ksig) { struct pcb *pcb = td->td_pcb; #ifdef __XSCALE__ if ((fsr & FAULT_IMPRECISE) != 0 && (tf->tf_spsr & PSR_MODE) == PSR_ABT32_MODE) { /* * Oops, an imprecise, double abort fault. We've lost the * r14_abt/spsr_abt values corresponding to the original * abort, and the spsr saved in the trapframe indicates * ABT mode. */ tf->tf_spsr &= ~PSR_MODE; /* * We use a simple heuristic to determine if the double abort * happened as a result of a kernel or user mode access. * If the current trapframe is at the top of the kernel stack, * the fault _must_ have come from user mode. */ if (tf != ((struct trapframe *)pcb->pcb_regs.sf_sp) - 1) { /* * Kernel mode. We're either about to die a * spectacular death, or pcb_onfault will come * to our rescue. Either way, the current value * of tf->tf_pc is irrelevant. */ tf->tf_spsr |= PSR_SVC32_MODE; if (pcb->pcb_onfault == NULL) printf("\nKernel mode double abort!\n"); } else { /* * User mode. We've lost the program counter at the * time of the fault (not that it was accurate anyway; * it's not called an imprecise fault for nothing). * About all we can do is copy r14_usr to tf_pc and * hope for the best. The process is about to get a * SIGBUS, so it's probably history anyway. */ tf->tf_spsr |= PSR_USR32_MODE; tf->tf_pc = tf->tf_usr_lr; } } /* FAR is invalid for imprecise exceptions */ if ((fsr & FAULT_IMPRECISE) != 0) far = 0; #endif /* __XSCALE__ */ if (pcb->pcb_onfault) { tf->tf_r0 = EFAULT; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return (0); } /* * At this point, if the fault happened in kernel mode, we're toast */ if (!TRAP_USERMODE(tf)) dab_fatal(tf, fsr, far, td, ksig); /* Deliver a bus error signal to the process */ ksig->signb = SIGBUS; ksig->code = 0; td->td_frame = tf; return (1); } /* * void prefetch_abort_handler(struct trapframe *tf) * * Abort handler called when instruction execution occurs at * a non existent or restricted (access permissions) memory page. * If the address is invalid and we were in SVC mode then panic as * the kernel should never prefetch abort. * If the address is invalid and the page is mapped then the user process * does no have read permission so send it a signal. * Otherwise fault the page in and try again. */ static void prefetch_abort_handler(struct trapframe *tf) { struct thread *td; struct proc * p; struct vm_map *map; vm_offset_t fault_pc, va; int error = 0; struct ksig ksig; #if 0 /* Update vmmeter statistics */ uvmexp.traps++; #endif #if 0 printf("prefetch abort handler: %p %p\n", (void*)tf->tf_pc, (void*)tf->tf_usr_lr); #endif td = curthread; p = td->td_proc; - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); if (TRAP_USERMODE(tf)) { td->td_frame = tf; if (td->td_cowgen != td->td_proc->p_cowgen) thread_cow_update(td); } fault_pc = tf->tf_pc; if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } /* Prefetch aborts cannot happen in kernel mode */ if (__predict_false(!TRAP_USERMODE(tf))) dab_fatal(tf, 0, tf->tf_pc, NULL, &ksig); td->td_pticks = 0; /* Ok validate the address, can only execute in USER space */ if (__predict_false(fault_pc >= VM_MAXUSER_ADDRESS || (fault_pc < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW))) { ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } map = &td->td_proc->p_vmspace->vm_map; va = trunc_page(fault_pc); /* * See if the pmap can handle this fault on its own... */ #ifdef DEBUG last_fault_code = -1; #endif if (pmap_fault_fixup(map->pmap, va, VM_PROT_READ, 1)) goto out; error = vm_fault(map, va, VM_PROT_READ | VM_PROT_EXECUTE, VM_FAULT_NORMAL); if (__predict_true(error == 0)) goto out; if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: userret(td, tf); } extern int badaddr_read_1(const uint8_t *, uint8_t *); extern int badaddr_read_2(const uint16_t *, uint16_t *); extern int badaddr_read_4(const uint32_t *, uint32_t *); /* * Tentatively read an 8, 16, or 32-bit value from 'addr'. * If the read succeeds, the value is written to 'rptr' and zero is returned. * Else, return EFAULT. */ int badaddr_read(void *addr, size_t size, void *rptr) { union { uint8_t v1; uint16_t v2; uint32_t v4; } u; int rv; cpu_drain_writebuf(); /* Read from the test address. */ switch (size) { case sizeof(uint8_t): rv = badaddr_read_1(addr, &u.v1); if (rv == 0 && rptr) *(uint8_t *) rptr = u.v1; break; case sizeof(uint16_t): rv = badaddr_read_2(addr, &u.v2); if (rv == 0 && rptr) *(uint16_t *) rptr = u.v2; break; case sizeof(uint32_t): rv = badaddr_read_4(addr, &u.v4); if (rv == 0 && rptr) *(uint32_t *) rptr = u.v4; break; default: panic("badaddr: invalid size (%lu)", (u_long) size); } /* Return EFAULT if the address was invalid, else zero */ return (rv); } Index: head/sys/arm/arm/trap-v6.c =================================================================== --- head/sys/arm/arm/trap-v6.c (revision 317060) +++ head/sys/arm/arm/trap-v6.c (revision 317061) @@ -1,647 +1,647 @@ /*- * Copyright 2014 Olivier Houchard * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * Copyright 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ktrace.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern char cachebailout[]; #ifdef DEBUG int last_fault_code; /* For the benefit of pmap_fault_fixup() */ #endif struct ksig { int sig; u_long code; vm_offset_t addr; }; typedef int abort_func_t(struct trapframe *, u_int, u_int, u_int, u_int, struct thread *, struct ksig *); static abort_func_t abort_fatal; static abort_func_t abort_align; static abort_func_t abort_icache; struct abort { abort_func_t *func; const char *desc; }; /* * How are the aborts handled? * * Undefined Code: * - Always fatal as we do not know what does it mean. * Imprecise External Abort: * - Always fatal, but can be handled somehow in the future. * Now, due to PCIe buggy hardware, ignored. * Precise External Abort: * - Always fatal, but who knows in the future??? * Debug Event: * - Special handling. * External Translation Abort (L1 & L2) * - Always fatal as something is screwed up in page tables or hardware. * Domain Fault (L1 & L2): * - Always fatal as we do not play game with domains. * Alignment Fault: * - Everything should be aligned in kernel with exception of user to kernel * and vice versa data copying, so if pcb_onfault is not set, it's fatal. * We generate signal in case of abort from user mode. * Instruction cache maintenance: * - According to manual, this is translation fault during cache maintenance * operation. So, it could be really complex in SMP case and fuzzy too * for cache operations working on virtual addresses. For now, we will * consider this abort as fatal. In fact, no cache maintenance on * not mapped virtual addresses should be called. As cache maintenance * operation (except DMB, DSB, and Flush Prefetch Buffer) are priviledged, * the abort is fatal for user mode as well for now. (This is good place to * note that cache maintenance on virtual address fill TLB.) * Acces Bit (L1 & L2): * - Fast hardware emulation for kernel and user mode. * Translation Fault (L1 & L2): * - Standard fault mechanism is held including vm_fault(). * Permission Fault (L1 & L2): * - Fast hardware emulation of modify bits and in other cases, standard * fault mechanism is held including vm_fault(). */ static const struct abort aborts[] = { {abort_fatal, "Undefined Code (0x000)"}, {abort_align, "Alignment Fault"}, {abort_fatal, "Debug Event"}, {NULL, "Access Bit (L1)"}, {NULL, "Instruction cache maintenance"}, {NULL, "Translation Fault (L1)"}, {NULL, "Access Bit (L2)"}, {NULL, "Translation Fault (L2)"}, {abort_fatal, "External Abort"}, {abort_fatal, "Domain Fault (L1)"}, {abort_fatal, "Undefined Code (0x00A)"}, {abort_fatal, "Domain Fault (L2)"}, {abort_fatal, "External Translation Abort (L1)"}, {NULL, "Permission Fault (L1)"}, {abort_fatal, "External Translation Abort (L2)"}, {NULL, "Permission Fault (L2)"}, {abort_fatal, "TLB Conflict Abort"}, {abort_fatal, "Undefined Code (0x401)"}, {abort_fatal, "Undefined Code (0x402)"}, {abort_fatal, "Undefined Code (0x403)"}, {abort_fatal, "Undefined Code (0x404)"}, {abort_fatal, "Undefined Code (0x405)"}, {abort_fatal, "Asynchronous External Abort"}, {abort_fatal, "Undefined Code (0x407)"}, {abort_fatal, "Asynchronous Parity Error on Memory Access"}, {abort_fatal, "Parity Error on Memory Access"}, {abort_fatal, "Undefined Code (0x40A)"}, {abort_fatal, "Undefined Code (0x40B)"}, {abort_fatal, "Parity Error on Translation (L1)"}, {abort_fatal, "Undefined Code (0x40D)"}, {abort_fatal, "Parity Error on Translation (L2)"}, {abort_fatal, "Undefined Code (0x40F)"} }; static __inline void call_trapsignal(struct thread *td, int sig, int code, vm_offset_t addr) { ksiginfo_t ksi; CTR4(KTR_TRAP, "%s: addr: %#x, sig: %d, code: %d", __func__, addr, sig, code); /* * TODO: some info would be nice to know * if we are serving data or prefetch abort. */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = (void *)addr; trapsignal(td, &ksi); } /* * abort_imprecise() handles the following abort: * * FAULT_EA_IMPREC - Imprecise External Abort * * The imprecise means that we don't know where the abort happened, * thus FAR is undefined. The abort should not never fire, but hot * plugging or accidental hardware failure can be the cause of it. * If the abort happens, it can even be on different (thread) context. * Without any additional support, the abort is fatal, as we do not * know what really happened. * * QQQ: Some additional functionality, like pcb_onfault but global, * can be implemented. Imprecise handlers could be registered * which tell us if the abort is caused by something they know * about. They should return one of three codes like: * FAULT_IS_MINE, * FAULT_CAN_BE_MINE, * FAULT_IS_NOT_MINE. * The handlers should be called until some of them returns * FAULT_IS_MINE value or all was called. If all handlers return * FAULT_IS_NOT_MINE value, then the abort is fatal. */ static __inline void abort_imprecise(struct trapframe *tf, u_int fsr, u_int prefetch, bool usermode) { /* * XXX - We can got imprecise abort as result of access * to not-present PCI/PCIe configuration space. */ #if 0 goto out; #endif abort_fatal(tf, FAULT_EA_IMPREC, fsr, 0, prefetch, curthread, NULL); /* * Returning from this function means that we ignore * the abort for good reason. Note that imprecise abort * could fire any time even in user mode. */ #if 0 out: if (usermode) userret(curthread, tf); #endif } /* * abort_debug() handles the following abort: * * FAULT_DEBUG - Debug Event * */ static __inline void abort_debug(struct trapframe *tf, u_int fsr, u_int prefetch, bool usermode, u_int far) { if (usermode) { struct thread *td; td = curthread; call_trapsignal(td, SIGTRAP, TRAP_BRKPT, far); userret(td, tf); } else { #ifdef KDB kdb_trap((prefetch) ? T_BREAKPOINT : T_WATCHPOINT, 0, tf); #else printf("No debugger in kernel.\n"); #endif } } /* * Abort handler. * * FAR, FSR, and everything what can be lost after enabling * interrupts must be grabbed before the interrupts will be * enabled. Note that when interrupts will be enabled, we * could even migrate to another CPU ... * * TODO: move quick cases to ASM */ void abort_handler(struct trapframe *tf, int prefetch) { struct thread *td; vm_offset_t far, va; int idx, rv; uint32_t fsr; struct ksig ksig; struct proc *p; struct pcb *pcb; struct vm_map *map; struct vmspace *vm; vm_prot_t ftype; bool usermode; #ifdef INVARIANTS void *onfault; #endif - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); td = curthread; fsr = (prefetch) ? cp15_ifsr_get(): cp15_dfsr_get(); #if __ARM_ARCH >= 7 far = (prefetch) ? cp15_ifar_get() : cp15_dfar_get(); #else far = (prefetch) ? TRAPF_PC(tf) : cp15_dfar_get(); #endif idx = FSR_TO_FAULT(fsr); usermode = TRAPF_USERMODE(tf); /* Abort came from user mode? */ if (usermode) td->td_frame = tf; CTR6(KTR_TRAP, "%s: fsr %#x (idx %u) far %#x prefetch %u usermode %d", __func__, fsr, idx, far, prefetch, usermode); /* * Firstly, handle aborts that are not directly related to mapping. */ if (__predict_false(idx == FAULT_EA_IMPREC)) { abort_imprecise(tf, fsr, prefetch, usermode); return; } if (__predict_false(idx == FAULT_DEBUG)) { abort_debug(tf, fsr, prefetch, usermode, far); return; } /* * ARM has a set of unprivileged load and store instructions * (LDRT/LDRBT/STRT/STRBT ...) which are supposed to be used in other * than user mode and OS should recognize their aborts and behave * appropriately. However, there is no way how to do that reasonably * in general unless we restrict the handling somehow. * * For now, these instructions are used only in copyin()/copyout() * like functions where usermode buffers are checked in advance that * they are not from KVA space. Thus, no action is needed here. */ /* * (1) Handle access and R/W hardware emulation aborts. * (2) Check that abort is not on pmap essential address ranges. * There is no way how to fix it, so we don't even try. */ rv = pmap_fault(PCPU_GET(curpmap), far, fsr, idx, usermode); if (rv == KERN_SUCCESS) return; #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (rv == KERN_INVALID_ADDRESS) goto nogo; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != far || (td->td_pflags & TDP_RESETSPUR) != 0) { td->td_md.md_spurflt_addr = far; td->td_pflags &= ~TDP_RESETSPUR; tlb_flush_local(far & ~PAGE_MASK); return; } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } /* Re-enable interrupts if they were enabled previously. */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(tf->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } p = td->td_proc; if (usermode) { td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); } /* Invoke the appropriate handler, if necessary. */ if (__predict_false(aborts[idx].func != NULL)) { if ((aborts[idx].func)(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } /* * At this point, we're dealing with one of the following aborts: * * FAULT_ICACHE - I-cache maintenance * FAULT_TRAN_xx - Translation * FAULT_PERM_xx - Permission */ /* * Don't pass faulting cache operation to vm_fault(). We don't want * to handle all vm stuff at this moment. */ pcb = td->td_pcb; if (__predict_false(pcb->pcb_onfault == cachebailout)) { tf->tf_r0 = far; /* return failing address */ tf->tf_pc = (register_t)pcb->pcb_onfault; return; } /* Handle remaining I-cache aborts. */ if (idx == FAULT_ICACHE) { if (abort_icache(tf, idx, fsr, far, prefetch, td, &ksig)) goto do_trapsignal; goto out; } va = trunc_page(far); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. If curproc * is NULL or curproc->p_vmspace is NULL the fault is fatal. */ vm = (p != NULL) ? p->p_vmspace : NULL; if (vm == NULL) goto nogo; map = &vm->vm_map; if (!usermode && (td->td_intr_nesting_level != 0 || pcb->pcb_onfault == NULL)) { abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } } ftype = (fsr & FSR_WNR) ? VM_PROT_WRITE : VM_PROT_READ; if (prefetch) ftype |= VM_PROT_EXECUTE; #ifdef DEBUG last_fault_code = fsr; #endif #ifdef INVARIANTS onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; #endif /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); #ifdef INVARIANTS pcb->pcb_onfault = onfault; #endif if (__predict_true(rv == KERN_SUCCESS)) goto out; nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != NULL) { tf->tf_r0 = rv; tf->tf_pc = (int)pcb->pcb_onfault; return; } CTR2(KTR_TRAP, "%s: vm_fault() failed with %d", __func__, rv); abort_fatal(tf, idx, fsr, far, prefetch, td, &ksig); return; } ksig.sig = SIGSEGV; ksig.code = (rv == KERN_PROTECTION_FAILURE) ? SEGV_ACCERR : SEGV_MAPERR; ksig.addr = far; do_trapsignal: call_trapsignal(td, ksig.sig, ksig.code, ksig.addr); out: if (usermode) userret(td, tf); } /* * abort_fatal() handles the following data aborts: * * FAULT_DEBUG - Debug Event * FAULT_ACCESS_xx - Acces Bit * FAULT_EA_PREC - Precise External Abort * FAULT_DOMAIN_xx - Domain Fault * FAULT_EA_TRAN_xx - External Translation Abort * FAULT_EA_IMPREC - Imprecise External Abort * + all undefined codes for ABORT * * We should never see these on a properly functioning system. * * This function is also called by the other handlers if they * detect a fatal problem. * * Note: If 'l' is NULL, we assume we're dealing with a prefetch abort. */ static int abort_fatal(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { bool usermode; const char *mode; const char *rw_mode; usermode = TRAPF_USERMODE(tf); #ifdef KDTRACE_HOOKS if (!usermode) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(tf, far)) return (0); } #endif mode = usermode ? "user" : "kernel"; rw_mode = fsr & FSR_WNR ? "write" : "read"; disable_interrupts(PSR_I|PSR_F); if (td != NULL) { printf("Fatal %s mode data abort: '%s' on %s\n", mode, aborts[idx].desc, rw_mode); printf("trapframe: %p\nFSR=%08x, FAR=", tf, fsr); if (idx != FAULT_EA_IMPREC) printf("%08x, ", far); else printf("Invalid, "); printf("spsr=%08x\n", tf->tf_spsr); } else { printf("Fatal %s mode prefetch abort at 0x%08x\n", mode, tf->tf_pc); printf("trapframe: %p, spsr=%08x\n", tf, tf->tf_spsr); } printf("r0 =%08x, r1 =%08x, r2 =%08x, r3 =%08x\n", tf->tf_r0, tf->tf_r1, tf->tf_r2, tf->tf_r3); printf("r4 =%08x, r5 =%08x, r6 =%08x, r7 =%08x\n", tf->tf_r4, tf->tf_r5, tf->tf_r6, tf->tf_r7); printf("r8 =%08x, r9 =%08x, r10=%08x, r11=%08x\n", tf->tf_r8, tf->tf_r9, tf->tf_r10, tf->tf_r11); printf("r12=%08x, ", tf->tf_r12); if (usermode) printf("usp=%08x, ulr=%08x", tf->tf_usr_sp, tf->tf_usr_lr); else printf("ssp=%08x, slr=%08x", tf->tf_svc_sp, tf->tf_svc_lr); printf(", pc =%08x\n\n", tf->tf_pc); #ifdef KDB if (debugger_on_panic || kdb_active) kdb_trap(fsr, 0, tf); #endif panic("Fatal abort"); /*NOTREACHED*/ } /* * abort_align() handles the following data abort: * * FAULT_ALIGN - Alignment fault * * Everything should be aligned in kernel with exception of user to kernel * and vice versa data copying, so if pcb_onfault is not set, it's fatal. * We generate signal in case of abort from user mode. */ static int abort_align(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { bool usermode; usermode = TRAPF_USERMODE(tf); if (!usermode) { if (td->td_intr_nesting_level == 0 && td != NULL && td->td_pcb->pcb_onfault != NULL) { tf->tf_r0 = EFAULT; tf->tf_pc = (int)td->td_pcb->pcb_onfault; return (0); } abort_fatal(tf, idx, fsr, far, prefetch, td, ksig); } /* Deliver a bus error signal to the process */ ksig->code = BUS_ADRALN; ksig->sig = SIGBUS; ksig->addr = far; return (1); } /* * abort_icache() handles the following data abort: * * FAULT_ICACHE - Instruction cache maintenance * * According to manual, FAULT_ICACHE is translation fault during cache * maintenance operation. In fact, no cache maintenance operation on * not mapped virtual addresses should be called. As cache maintenance * operation (except DMB, DSB, and Flush Prefetch Buffer) are priviledged, * the abort is concider as fatal for now. However, all the matter with * cache maintenance operation on virtual addresses could be really complex * and fuzzy in SMP case, so maybe in future standard fault mechanism * should be held here including vm_fault() calling. */ static int abort_icache(struct trapframe *tf, u_int idx, u_int fsr, u_int far, u_int prefetch, struct thread *td, struct ksig *ksig) { abort_fatal(tf, idx, fsr, far, prefetch, td, ksig); return(0); } Index: head/sys/arm/arm/undefined.c =================================================================== --- head/sys/arm/arm/undefined.c (revision 317060) +++ head/sys/arm/arm/undefined.c (revision 317061) @@ -1,349 +1,349 @@ /* $NetBSD: undefined.c,v 1.22 2003/11/29 22:21:29 bjh21 Exp $ */ /*- * Copyright (c) 2001 Ben Harris. * Copyright (c) 1995 Mark Brinicombe. * Copyright (c) 1995 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * undefined.c * * Fault handler * * Created : 06/01/95 */ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif #define ARM_COPROC_INSN(insn) (((insn) & (1 << 27)) != 0) #define ARM_VFP_INSN(insn) ((((insn) & 0xfe000000) == 0xf2000000) || \ (((insn) & 0xff100000) == 0xf4000000)) #define ARM_COPROC(insn) (((insn) >> 8) & 0xf) #define THUMB_32BIT_INSN(insn) ((insn) >= 0xe800) #define THUMB_COPROC_INSN(insn) (((insn) & (3 << 26)) == (3 << 26)) #define THUMB_COPROC_UNDEFINED(insn) (((insn) & 0x3e << 20) == 0) #define THUMB_VFP_INSN(insn) (((insn) & (3 << 24)) == (3 << 24)) #define THUMB_COPROC(insn) (((insn) >> 8) & 0xf) #define COPROC_VFP 10 static int gdb_trapper(u_int, u_int, struct trapframe *, int); LIST_HEAD(, undefined_handler) undefined_handlers[MAX_COPROCS]; void * install_coproc_handler(int coproc, undef_handler_t handler) { struct undefined_handler *uh; KASSERT(coproc >= 0 && coproc < MAX_COPROCS, ("bad coproc")); KASSERT(handler != NULL, ("handler is NULL")); /* Used to be legal. */ /* XXX: M_TEMP??? */ uh = malloc(sizeof(*uh), M_TEMP, M_WAITOK); uh->uh_handler = handler; install_coproc_handler_static(coproc, uh); return uh; } void install_coproc_handler_static(int coproc, struct undefined_handler *uh) { LIST_INSERT_HEAD(&undefined_handlers[coproc], uh, uh_link); } void remove_coproc_handler(void *cookie) { struct undefined_handler *uh = cookie; LIST_REMOVE(uh, uh_link); free(uh, M_TEMP); } static int gdb_trapper(u_int addr, u_int insn, struct trapframe *frame, int code) { struct thread *td; ksiginfo_t ksi; td = (curthread == NULL) ? &thread0 : curthread; if (insn == GDB_BREAKPOINT || insn == GDB5_BREAKPOINT) { if (code == FAULT_USER) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_BRKPT; ksi.ksi_addr = (u_int32_t *)addr; trapsignal(td, &ksi); return 0; } #if 0 #ifdef KGDB return !kgdb_trap(T_BREAKPOINT, frame); #endif #endif } return 1; } static struct undefined_handler gdb_uh; void undefined_init(void) { int loop; /* Not actually necessary -- the initialiser is just NULL */ for (loop = 0; loop < MAX_COPROCS; ++loop) LIST_INIT(&undefined_handlers[loop]); /* Install handler for GDB breakpoints */ gdb_uh.uh_handler = gdb_trapper; install_coproc_handler_static(0, &gdb_uh); } void undefinedinstruction(struct trapframe *frame) { struct thread *td; u_int fault_pc; int fault_instruction; int fault_code; int coprocessor; struct undefined_handler *uh; int error; #ifdef VERBOSE_ARM32 int s; #endif ksiginfo_t ksi; /* Enable interrupts if they were enabled before the exception. */ if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(frame->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); fault_pc = frame->tf_pc; /* * Get the current thread/proc structure or thread0/proc0 if there is * none. */ td = curthread == NULL ? &thread0 : curthread; coprocessor = 0; if ((frame->tf_spsr & PSR_T) == 0) { /* * Make sure the program counter is correctly aligned so we * don't take an alignment fault trying to read the opcode. */ if (__predict_false((fault_pc & 3) != 0)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLADR; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); userret(td, frame); return; } /* * Should use fuword() here .. but in the interests of * squeezing every bit of speed we will just use ReadWord(). * We know the instruction can be read as was just executed * so this will never fail unless the kernel is screwed up * in which case it does not really matter does it ? */ fault_instruction = *(u_int32_t *)fault_pc; /* Check for coprocessor instruction */ /* * According to the datasheets you only need to look at bit * 27 of the instruction to tell the difference between and * undefined instruction and a coprocessor instruction * following an undefined instruction trap. */ if (ARM_COPROC_INSN(fault_instruction)) coprocessor = ARM_COPROC(fault_instruction); else { /* check for special instructions */ if (ARM_VFP_INSN(fault_instruction)) coprocessor = COPROC_VFP; /* vfp / simd */ } } else { #if __ARM_ARCH >= 7 fault_instruction = *(uint16_t *)fault_pc; if (THUMB_32BIT_INSN(fault_instruction)) { fault_instruction <<= 16; fault_instruction |= *(uint16_t *)(fault_pc + 2); /* * Is it a Coprocessor, Advanced SIMD, or * Floating-point instruction. */ if (THUMB_COPROC_INSN(fault_instruction)) { if (THUMB_COPROC_UNDEFINED(fault_instruction)) { /* undefined insn */ } else if (THUMB_VFP_INSN(fault_instruction)) coprocessor = COPROC_VFP; else coprocessor = THUMB_COPROC(fault_instruction); } } #else /* * No support for Thumb-2 on this cpu */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLADR; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); userret(td, frame); return; #endif } if ((frame->tf_spsr & PSR_MODE) == PSR_USR32_MODE) { /* * Modify the fault_code to reflect the USR/SVC state at * time of fault. */ fault_code = FAULT_USER; td->td_frame = frame; } else fault_code = 0; /* OK this is were we do something about the instruction. */ LIST_FOREACH(uh, &undefined_handlers[coprocessor], uh_link) if (uh->uh_handler(fault_pc, fault_instruction, frame, fault_code) == 0) break; if (fault_code & FAULT_USER) { /* TODO: No support for ptrace from Thumb-2 */ if ((frame->tf_spsr & PSR_T) == 0 && fault_instruction == PTRACE_BREAKPOINT) { PROC_LOCK(td->td_proc); _PHOLD(td->td_proc); error = ptrace_clear_single_step(td); _PRELE(td->td_proc); PROC_UNLOCK(td->td_proc); if (error != 0) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLOPC; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); } return; } } if (uh == NULL && (fault_code & FAULT_USER)) { /* Fault has not been handled */ ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGILL; ksi.ksi_code = ILL_ILLOPC; ksi.ksi_addr = (u_int32_t *)(intptr_t) fault_pc; trapsignal(td, &ksi); } if ((fault_code & FAULT_USER) == 0) { if (fault_instruction == KERNEL_BREAKPOINT) { #ifdef KDB kdb_trap(T_BREAKPOINT, 0, frame); #else printf("No debugger in kernel.\n"); #endif return; } else panic("Undefined instruction in kernel.\n"); } userret(td, frame); } Index: head/sys/arm/include/counter.h =================================================================== --- head/sys/arm/include/counter.h (revision 317060) +++ head/sys/arm/include/counter.h (revision 317061) @@ -1,87 +1,91 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #include +extern struct pcpu __pcpu[]; + +#define EARLY_COUNTER &__pcpu[0].pc_early_dummy_counter + #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) * cpu))); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid)), 0); } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) counter_u64_add(c, inc) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { atomic_add_64((uint64_t *)zpcpu_get(c), inc); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/arm/include/pcpu.h =================================================================== --- head/sys/arm/include/pcpu.h (revision 317060) +++ head/sys/arm/include/pcpu.h (revision 317061) @@ -1,146 +1,146 @@ /*- * Copyright (c) 1999 Luoqi Chen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/include/globaldata.h,v 1.27 2001/04/27 * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifdef _KERNEL #include #include #define ALT_STACK_SIZE 128 struct vmspace; #endif /* _KERNEL */ #if __ARM_ARCH >= 6 #define PCPU_MD_FIELDS \ unsigned int pc_vfpsid; \ unsigned int pc_vfpmvfr0; \ unsigned int pc_vfpmvfr1; \ struct pmap *pc_curpmap; \ struct mtx pc_cmap_lock; \ void *pc_cmap1_pte2p; \ void *pc_cmap2_pte2p; \ caddr_t pc_cmap1_addr; \ caddr_t pc_cmap2_addr; \ vm_offset_t pc_qmap_addr; \ void *pc_qmap_pte2p; \ unsigned int pc_dbreg[32]; \ int pc_dbreg_cmd; \ - char __pad[27] + char __pad[155] #else #define PCPU_MD_FIELDS \ - char __pad[157] + char __pad[93] #endif #ifdef _KERNEL #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 struct pcb; struct pcpu; extern struct pcpu *pcpup; #if __ARM_ARCH >= 6 #define CPU_MASK (0xf) #ifndef SMP #define get_pcpu() (pcpup) #else #define get_pcpu() __extension__ ({ \ int id; \ __asm __volatile("mrc p15, 0, %0, c0, c0, 5" : "=r" (id)); \ (pcpup + (id & CPU_MASK)); \ }) #endif static inline struct thread * get_curthread(void) { void *ret; __asm __volatile("mrc p15, 0, %0, c13, c0, 4" : "=r" (ret)); return (ret); } static inline void set_curthread(struct thread *td) { __asm __volatile("mcr p15, 0, %0, c13, c0, 4" : : "r" (td)); } static inline void * get_tls(void) { void *tls; /* TPIDRURW contains the authoritative value. */ __asm __volatile("mrc p15, 0, %0, c13, c0, 2" : "=r" (tls)); return (tls); } static inline void set_tls(void *tls) { /* * Update both TPIDRURW and TPIDRURO. TPIDRURW needs to be written * first to ensure that a context switch between the two writes will * still give the desired result of updating both. */ __asm __volatile( "mcr p15, 0, %0, c13, c0, 2\n" "mcr p15, 0, %0, c13, c0, 3\n" : : "r" (tls)); } #define curthread get_curthread() #else #define get_pcpu() pcpup #endif #define PCPU_GET(member) (get_pcpu()->pc_ ## member) #define PCPU_ADD(member, value) (get_pcpu()->pc_ ## member += (value)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&get_pcpu()->pc_ ## member) #define PCPU_SET(member,value) (get_pcpu()->pc_ ## member = (value)) void pcpu0_init(void); #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/arm64/include/counter.h =================================================================== --- head/sys/arm64/include/counter.h (revision 317060) +++ head/sys/arm64/include/counter.h (revision 317061) @@ -1,85 +1,89 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_COUNTER_H_ #define _MACHINE_COUNTER_H_ #include #include +extern struct pcpu __pcpu[]; + +#define EARLY_COUNTER &__pcpu[0].pc_early_dummy_counter + #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) counter_u64_add(c, inc) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { atomic_add_64((uint64_t *)zpcpu_get(c), inc); } #endif /* ! _MACHINE_COUNTER_H_ */ Index: head/sys/arm64/include/pcpu.h =================================================================== --- head/sys/arm64/include/pcpu.h (revision 317060) +++ head/sys/arm64/include/pcpu.h (revision 317061) @@ -1,77 +1,77 @@ /*- * Copyright (c) 1999 Luoqi Chen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/include/globaldata.h,v 1.27 2001/04/27 * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #include #include #define ALT_STACK_SIZE 128 #define PCPU_MD_FIELDS \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_midr; /* stored MIDR value */ \ uint64_t pc_clock; \ - char __pad[113] + char __pad[241] #ifdef _KERNEL struct pcb; struct pcpu; static inline struct pcpu * get_pcpu(void) { struct pcpu *pcpu; __asm __volatile("mov %0, x18" : "=&r"(pcpu)); return (pcpu); } static inline struct thread * get_curthread(void) { struct thread *td; __asm __volatile("ldr %0, [x18]" : "=&r"(td)); return (td); } #define curthread get_curthread() #define PCPU_GET(member) (get_pcpu()->pc_ ## member) #define PCPU_ADD(member, value) (get_pcpu()->pc_ ## member += (value)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&get_pcpu()->pc_ ## member) #define PCPU_SET(member,value) (get_pcpu()->pc_ ## member = (value)) #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 317060) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 317061) @@ -1,6076 +1,6076 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considred clean despite have some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ #ifdef illumos size_t cbytes; #endif abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); #ifdef illumos if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); #else ssize_t resid = uio->uio_resid; error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); if (error != 0) { uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; dmu_return_arcbuf(abuf); break; } #endif } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } #ifdef illumos ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); #endif } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT(zio != NULL); ASSERT(size != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= zp->z_blksz); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* fast path (should be redundant with vfs namecache) */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; #endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; vm_page_t mlast; vm_object_t object; caddr_t va; struct sf_buf *sf; off_t startoff, endoff; int i, error; vm_pindex_t reqstart, reqend; int lsize, size; object = m[0]->object; error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_vmobject_wlock(object); if (m[count - 1]->valid != 0 && --count == 0) { zfs_vmobject_wunlock(object); goto out; } mlast = m[count - 1]; if (IDX_TO_OFF(mlast->pindex) >= object->un_pager.vnp.vnp_size) { zfs_vmobject_wunlock(object); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, count); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, count); lsize = PAGE_SIZE; if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size) lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex); zfs_vmobject_wunlock(object); for (i = 0; i < count; i++) { size = PAGE_SIZE; if (i == count - 1) size = lsize; va = zfs_map_page(m[i], &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex), size, va, DMU_READ_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); if (error != 0) goto out; } zfs_vmobject_wlock(object); for (i = 0; i < count; i++) m[i]->valid = VM_PAGE_BITS_ALL; zfs_vmobject_wunlock(object); out: ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error == 0) { if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (zfs_vm_pagerret_ok); } else return (zfs_vm_pagerret_error); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { i = 0; for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, ncount); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_range_unlock(rl); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * is non-zero; otherwise, they behave like unprivileged * processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP) error = vop_stdpathconf(ap); return (error); } static int zfs_freebsd_fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_ACL_PATH_MAX: case _PC_MAC_PRESENT: return (zfs_freebsd_pathconf(ap)); default: return (fifo_specops.vop_pathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp;; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); uint64_t parent; int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_fifo_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; Index: head/sys/compat/linprocfs/linprocfs.c =================================================================== --- head/sys/compat/linprocfs/linprocfs.c (revision 317060) +++ head/sys/compat/linprocfs/linprocfs.c (revision 317061) @@ -1,1614 +1,1614 @@ /*- * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav * Copyright (c) 1999 Pierre Beyssac * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif /* __i386__ || __amd64__ */ #include #include #include #include #include #include /* * Various conversion macros */ #define T2J(x) ((long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to jiffies */ #define T2CS(x) ((unsigned long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to centiseconds */ #define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */ #define B2K(x) ((x) >> 10) /* bytes to kbytes */ #define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */ #define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */ #define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */ #define TV2J(x) ((x)->tv_sec * 100UL + (x)->tv_usec / 10000) /** * @brief Mapping of ki_stat in struct kinfo_proc to the linux state * * The linux procfs state field displays one of the characters RSDZTW to * denote running, sleeping in an interruptible wait, waiting in an * uninterruptible disk sleep, a zombie process, process is being traced * or stopped, or process is paging respectively. * * Our struct kinfo_proc contains the variable ki_stat which contains a * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK. * * This character array is used with ki_stati-1 as an index and tries to * map our states to suitable linux states. */ static char linux_state[] = "RRSTZDD"; /* * Filler function for proc/meminfo */ static int linprocfs_domeminfo(PFS_FILL_ARGS) { unsigned long memtotal; /* total memory in bytes */ unsigned long memused; /* used memory in bytes */ unsigned long memfree; /* free memory in bytes */ unsigned long buffers, cached; /* buffer / cache memory ??? */ unsigned long long swaptotal; /* total swap space in bytes */ unsigned long long swapused; /* used swap space in bytes */ unsigned long long swapfree; /* free swap space in bytes */ int i, j; memtotal = physmem * PAGE_SIZE; /* * The correct thing here would be: * memfree = vm_cnt.v_free_count * PAGE_SIZE; memused = memtotal - memfree; * * but it might mislead linux binaries into thinking there * is very little memory left, so we cheat and tell them that * all memory that isn't wired down is free. */ memused = vm_cnt.v_wire_count * PAGE_SIZE; memfree = memtotal - memused; swap_pager_status(&i, &j); swaptotal = (unsigned long long)i * PAGE_SIZE; swapused = (unsigned long long)j * PAGE_SIZE; swapfree = swaptotal - swapused; /* * We'd love to be able to write: * buffers = bufspace; * * but bufspace is internal to vfs_bio.c and we don't feel * like unstaticizing it just for linprocfs's sake. */ buffers = 0; cached = vm_cnt.v_inactive_count * PAGE_SIZE; sbuf_printf(sb, "MemTotal: %9lu kB\n" "MemFree: %9lu kB\n" "Buffers: %9lu kB\n" "Cached: %9lu kB\n" "SwapTotal:%9llu kB\n" "SwapFree: %9llu kB\n", B2K(memtotal), B2K(memfree), B2K(buffers), B2K(cached), B2K(swaptotal), B2K(swapfree)); return (0); } #if defined(__i386__) || defined(__amd64__) /* * Filler function for proc/cpuinfo (i386 & amd64 version) */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int hw_model[2]; char model[128]; uint64_t freq; size_t size; int fqmhz, fqkhz; int i; /* * We default the flags to include all non-conflicting flags, * and the Intel versions of conflicting flags. */ static char *flags[] = { "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", "sep", "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "b19", "b20", "b21", "mmxext", "mmx", "fxsr", "xmm", "sse2", "b27", "b28", "b29", "3dnowext", "3dnow" }; hw_model[0] = CTL_HW; hw_model[1] = HW_MODEL; model[0] = '\0'; size = sizeof(model); if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0) strcpy(model, "unknown"); for (i = 0; i < mp_ncpus; ++i) { sbuf_printf(sb, "processor\t: %d\n" "vendor_id\t: %.20s\n" "cpu family\t: %u\n" "model\t\t: %u\n" "model name\t: %s\n" "stepping\t: %u\n\n", i, cpu_vendor, CPUID_TO_FAMILY(cpu_id), CPUID_TO_MODEL(cpu_id), model, cpu_id & CPUID_STEPPING); /* XXX per-cpu vendor / class / model / id? */ } sbuf_cat(sb, "flags\t\t:"); #ifdef __i386__ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if (cpu_class < CPUCLASS_686) flags[16] = "fcmov"; break; case CPU_VENDOR_CYRIX: flags[24] = "cxmmx"; break; } #endif for (i = 0; i < 32; i++) if (cpu_feature & (1 << i)) sbuf_printf(sb, " %s", flags[i]); sbuf_cat(sb, "\n"); freq = atomic_load_acq_64(&tsc_freq); if (freq != 0) { fqmhz = (freq + 4999) / 1000000; fqkhz = ((freq + 4999) / 10000) % 100; sbuf_printf(sb, "cpu MHz\t\t: %d.%02d\n" "bogomips\t: %d.%02d\n", fqmhz, fqkhz, fqmhz, fqkhz); } return (0); } #endif /* __i386__ || __amd64__ */ /* * Filler function for proc/mtab * * This file doesn't exist in Linux' procfs, but is included here so * users can symlink /compat/linux/etc/mtab to /proc/mtab */ static int linprocfs_domtab(PFS_FILL_ARGS) { struct nameidata nd; const char *lep; char *dlep, *flep, *mntto, *mntfrom, *fstype; size_t lep_len; int error; struct statfs *buf, *sp; size_t count; /* resolve symlinks etc. in the emulation tree prefix */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); lep = linux_emul_path; if (error == 0) { if (vn_fullpath(td, nd.ni_vp, &dlep, &flep) == 0) lep = dlep; vrele(nd.ni_vp); } lep_len = strlen(lep); buf = NULL; error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count, UIO_SYSSPACE, MNT_WAIT); if (error != 0) { free(buf, M_TEMP); free(flep, M_TEMP); return (error); } for (sp = buf; count > 0; sp++, count--) { /* determine device name */ mntfrom = sp->f_mntfromname; /* determine mount point */ mntto = sp->f_mntonname; if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; /* determine fs type */ fstype = sp->f_fstypename; if (strcmp(fstype, pn->pn_info->pi_name) == 0) mntfrom = fstype = "proc"; else if (strcmp(fstype, "procfs") == 0) continue; if (strcmp(fstype, "linsysfs") == 0) { sbuf_printf(sb, "/sys %s sysfs %s", mntto, sp->f_flags & MNT_RDONLY ? "ro" : "rw"); } else { /* For Linux msdosfs is called vfat */ if (strcmp(fstype, "msdosfs") == 0) fstype = "vfat"; sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype, sp->f_flags & MNT_RDONLY ? "ro" : "rw"); } #define ADD_OPTION(opt, name) \ if (sp->f_flags & (opt)) sbuf_printf(sb, "," name); ADD_OPTION(MNT_SYNCHRONOUS, "sync"); ADD_OPTION(MNT_NOEXEC, "noexec"); ADD_OPTION(MNT_NOSUID, "nosuid"); ADD_OPTION(MNT_UNION, "union"); ADD_OPTION(MNT_ASYNC, "async"); ADD_OPTION(MNT_SUIDDIR, "suiddir"); ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow"); ADD_OPTION(MNT_NOATIME, "noatime"); #undef ADD_OPTION /* a real Linux mtab will also show NFS options */ sbuf_printf(sb, " 0 0\n"); } free(buf, M_TEMP); free(flep, M_TEMP); return (error); } /* * Filler function for proc/partitions */ static int linprocfs_dopartitions(PFS_FILL_ARGS) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp; int major, minor; g_topology_lock(); sbuf_printf(sb, "major minor #blocks name rio rmerge rsect " "ruse wio wmerge wsect wuse running use aveq\n"); LIST_FOREACH(cp, &g_classes, class) { if (strcmp(cp->name, "DISK") == 0 || strcmp(cp->name, "PART") == 0) LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (linux_driver_get_major_minor( pp->name, &major, &minor) != 0) { major = 0; minor = 0; } sbuf_printf(sb, "%d %d %lld %s " "%d %d %d %d %d " "%d %d %d %d %d %d\n", major, minor, (long long)pp->mediasize, pp->name, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } } } g_topology_unlock(); return (0); } /* * Filler function for proc/stat */ static int linprocfs_dostat(PFS_FILL_ARGS) { struct pcpu *pcpu; long cp_time[CPUSTATES]; long *cp; struct timeval boottime; int i; read_cpu_time(cp_time); getboottime(&boottime); sbuf_printf(sb, "cpu %ld %ld %ld %ld\n", T2J(cp_time[CP_USER]), T2J(cp_time[CP_NICE]), T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/), T2J(cp_time[CP_IDLE])); CPU_FOREACH(i) { pcpu = pcpu_find(i); cp = pcpu->pc_cp_time; sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", i, T2J(cp[CP_USER]), T2J(cp[CP_NICE]), T2J(cp[CP_SYS] /*+ cp[CP_INTR]*/), T2J(cp[CP_IDLE])); } sbuf_printf(sb, "disk 0 0 0 0\n" - "page %u %u\n" - "swap %u %u\n" - "intr %u\n" - "ctxt %u\n" + "page %ju %ju\n" + "swap %ju %ju\n" + "intr %ju\n" + "ctxt %ju\n" "btime %lld\n", - vm_cnt.v_vnodepgsin, - vm_cnt.v_vnodepgsout, - vm_cnt.v_swappgsin, - vm_cnt.v_swappgsout, - vm_cnt.v_intr, - vm_cnt.v_swtch, + (uintmax_t)VM_CNT_FETCH(v_vnodepgsin), + (uintmax_t)VM_CNT_FETCH(v_vnodepgsout), + (uintmax_t)VM_CNT_FETCH(v_swappgsin), + (uintmax_t)VM_CNT_FETCH(v_swappgsout), + (uintmax_t)VM_CNT_FETCH(v_intr), + (uintmax_t)VM_CNT_FETCH(v_swtch), (long long)boottime.tv_sec); return (0); } static int linprocfs_doswaps(PFS_FILL_ARGS) { struct xswdev xsw; uintmax_t total, used; int n; char devname[SPECNAMELEN + 1]; sbuf_printf(sb, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); for (n = 0; ; n++) { if (swap_dev_info(n, &xsw, devname, sizeof(devname)) != 0) break; total = (uintmax_t)xsw.xsw_nblks * PAGE_SIZE / 1024; used = (uintmax_t)xsw.xsw_used * PAGE_SIZE / 1024; /* * The space and not tab after the device name is on * purpose. Linux does so. */ sbuf_printf(sb, "/dev/%-34s unknown\t\t%jd\t%jd\t-1\n", devname, total, used); } return (0); } /* * Filler function for proc/uptime */ static int linprocfs_douptime(PFS_FILL_ARGS) { long cp_time[CPUSTATES]; struct timeval tv; getmicrouptime(&tv); read_cpu_time(cp_time); sbuf_printf(sb, "%lld.%02ld %ld.%02lu\n", (long long)tv.tv_sec, tv.tv_usec / 10000, T2S(cp_time[CP_IDLE] / mp_ncpus), T2CS(cp_time[CP_IDLE] / mp_ncpus) % 100); return (0); } /* * Get OS build date */ static void linprocfs_osbuild(struct thread *td, struct sbuf *sb) { #if 0 char osbuild[256]; char *cp1, *cp2; strncpy(osbuild, version, 256); osbuild[255] = '\0'; cp1 = strstr(osbuild, "\n"); cp2 = strstr(osbuild, ":"); if (cp1 && cp2) { *cp1 = *cp2 = '\0'; cp1 = strstr(osbuild, "#"); } else cp1 = NULL; if (cp1) sbuf_printf(sb, "%s%s", cp1, cp2 + 1); else #endif sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977"); } /* * Get OS builder */ static void linprocfs_osbuilder(struct thread *td, struct sbuf *sb) { #if 0 char builder[256]; char *cp; cp = strstr(version, "\n "); if (cp) { strncpy(builder, cp + 5, 256); builder[255] = '\0'; cp = strstr(builder, ":"); if (cp) *cp = '\0'; } if (cp) sbuf_cat(sb, builder); else #endif sbuf_cat(sb, "des@freebsd.org"); } /* * Filler function for proc/version */ static int linprocfs_doversion(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s version %s (", osname, osrelease); linprocfs_osbuilder(td, sb); sbuf_cat(sb, ") (gcc version " __VERSION__ ") "); linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/loadavg */ static int linprocfs_doloadavg(PFS_FILL_ARGS) { sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", (int)(averunnable.ldavg[0] / averunnable.fscale), (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[1] / averunnable.fscale), (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ ); return (0); } /* * Filler function for proc/pid/stat */ static int linprocfs_doprocstat(PFS_FILL_ARGS) { struct kinfo_proc kp; struct timeval boottime; char state; static int ratelimit = 0; vm_offset_t startcode, startdata; getboottime(&boottime); sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); sx_sunlock(&proctree_lock); if (p->p_vmspace) { startcode = (vm_offset_t)p->p_vmspace->vm_taddr; startdata = (vm_offset_t)p->p_vmspace->vm_daddr; } else { startcode = 0; startdata = 0; } sbuf_printf(sb, "%d", p->p_pid); #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg) PS_ADD("comm", "(%s)", p->p_comm); if (kp.ki_stat > sizeof(linux_state)) { state = 'R'; if (ratelimit == 0) { printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", kp.ki_stat, sizeof(linux_state)); ++ratelimit; } } else state = linux_state[kp.ki_stat - 1]; PS_ADD("state", "%c", state); PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0); PS_ADD("pgrp", "%d", p->p_pgid); PS_ADD("session", "%d", p->p_session->s_sid); PROC_UNLOCK(p); PS_ADD("tty", "%ju", (uintmax_t)kp.ki_tdev); PS_ADD("tpgid", "%d", kp.ki_tpgid); PS_ADD("flags", "%u", 0); /* XXX */ PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt); PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt); PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt); PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt); PS_ADD("utime", "%ld", TV2J(&kp.ki_rusage.ru_utime)); PS_ADD("stime", "%ld", TV2J(&kp.ki_rusage.ru_stime)); PS_ADD("cutime", "%ld", TV2J(&kp.ki_rusage_ch.ru_utime)); PS_ADD("cstime", "%ld", TV2J(&kp.ki_rusage_ch.ru_stime)); PS_ADD("priority", "%d", kp.ki_pri.pri_user); PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */ PS_ADD("0", "%d", 0); /* removed field */ PS_ADD("itrealvalue", "%d", 0); /* XXX */ PS_ADD("starttime", "%lu", TV2J(&kp.ki_start) - TV2J(&boottime)); PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size)); PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize); PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss); PS_ADD("startcode", "%ju", (uintmax_t)startcode); PS_ADD("endcode", "%ju", (uintmax_t)startdata); PS_ADD("startstack", "%u", 0); /* XXX */ PS_ADD("kstkesp", "%u", 0); /* XXX */ PS_ADD("kstkeip", "%u", 0); /* XXX */ PS_ADD("signal", "%u", 0); /* XXX */ PS_ADD("blocked", "%u", 0); /* XXX */ PS_ADD("sigignore", "%u", 0); /* XXX */ PS_ADD("sigcatch", "%u", 0); /* XXX */ PS_ADD("wchan", "%u", 0); /* XXX */ PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap); PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap); PS_ADD("exitsignal", "%d", 0); /* XXX */ PS_ADD("processor", "%u", kp.ki_lastcpu); PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */ PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */ #undef PS_ADD sbuf_putc(sb, '\n'); return (0); } /* * Filler function for proc/pid/statm */ static int linprocfs_doprocstatm(PFS_FILL_ARGS) { struct kinfo_proc kp; segsz_t lsize; sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* * See comments in linprocfs_doprocstatus() regarding the * computation of lsize. */ /* size resident share trs drs lrs dt */ sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size)); sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize); sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */ sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize); sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "%ju ", (uintmax_t)lsize); sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */ return (0); } /* * Filler function for proc/pid/status */ static int linprocfs_doprocstatus(PFS_FILL_ARGS) { struct kinfo_proc kp; char *state; segsz_t lsize; struct thread *td2; struct sigacts *ps; l_sigset_t siglist, sigignore, sigcatch; int i; sx_slock(&proctree_lock); PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { switch(p->p_state) { case PRS_NEW: state = "I (idle)"; break; case PRS_NORMAL: if (p->p_flag & P_WEXIT) { state = "X (exiting)"; break; } switch(td2->td_state) { case TDS_INHIBITED: state = "S (sleeping)"; break; case TDS_RUNQ: case TDS_RUNNING: state = "R (running)"; break; default: state = "? (unknown)"; break; } break; case PRS_ZOMBIE: state = "Z (zombie)"; break; default: state = "? (unknown)"; break; } } fill_kinfo_proc(p, &kp); sx_sunlock(&proctree_lock); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); /* * Credentials */ sbuf_printf(sb, "Pid:\t%d\n", p->p_pid); sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ? p->p_pptr->p_pid : 0); sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid, p->p_ucred->cr_uid, p->p_ucred->cr_svuid, /* FreeBSD doesn't have fsuid */ p->p_ucred->cr_uid); sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid, p->p_ucred->cr_gid, p->p_ucred->cr_svgid, /* FreeBSD doesn't have fsgid */ p->p_ucred->cr_gid); sbuf_cat(sb, "Groups:\t"); for (i = 0; i < p->p_ucred->cr_ngroups; i++) sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]); PROC_UNLOCK(p); sbuf_putc(sb, '\n'); /* * Memory * * While our approximation of VmLib may not be accurate (I * don't know of a simple way to verify it, and I'm not sure * it has much meaning anyway), I believe it's good enough. * * The same code that could (I think) accurately compute VmLib * could also compute VmLck, but I don't really care enough to * implement it. Submissions are welcome. */ sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size)); sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */ sbuf_printf(sb, "VmRSS:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize)); sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize)); sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize)); sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize)); /* * Signal masks */ PROC_LOCK(p); bsd_to_linux_sigset(&p->p_siglist, &siglist); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); bsd_to_linux_sigset(&ps->ps_sigignore, &sigignore); bsd_to_linux_sigset(&ps->ps_sigcatch, &sigcatch); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); sbuf_printf(sb, "SigPnd:\t%016jx\n", siglist.__mask); /* * XXX. SigBlk - target thread's signal mask, td_sigmask. * To implement SigBlk pseudofs should support proc/tid dir entries. */ sbuf_printf(sb, "SigBlk:\t%016x\n", 0); sbuf_printf(sb, "SigIgn:\t%016jx\n", sigignore.__mask); sbuf_printf(sb, "SigCgt:\t%016jx\n", sigcatch.__mask); /* * Linux also prints the capability masks, but we don't have * capabilities yet, and when we do get them they're likely to * be meaningless to Linux programs, so we lie. XXX */ sbuf_printf(sb, "CapInh:\t%016x\n", 0); sbuf_printf(sb, "CapPrm:\t%016x\n", 0); sbuf_printf(sb, "CapEff:\t%016x\n", 0); return (0); } /* * Filler function for proc/pid/cwd */ static int linprocfs_doproccwd(PFS_FILL_ARGS) { struct filedesc *fdp; struct vnode *vp; char *fullpath = "unknown"; char *freepath = NULL; fdp = p->p_fd; FILEDESC_SLOCK(fdp); vp = fdp->fd_cdir; if (vp != NULL) VREF(vp); FILEDESC_SUNLOCK(fdp); vn_fullpath(td, vp, &fullpath, &freepath); if (vp != NULL) vrele(vp); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/root */ static int linprocfs_doprocroot(PFS_FILL_ARGS) { struct filedesc *fdp; struct vnode *vp; char *fullpath = "unknown"; char *freepath = NULL; fdp = p->p_fd; FILEDESC_SLOCK(fdp); vp = jailed(p->p_ucred) ? fdp->fd_jdir : fdp->fd_rdir; if (vp != NULL) VREF(vp); FILEDESC_SUNLOCK(fdp); vn_fullpath(td, vp, &fullpath, &freepath); if (vp != NULL) vrele(vp); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/cmdline */ static int linprocfs_doproccmdline(PFS_FILL_ARGS) { int ret; PROC_LOCK(p); if ((ret = p_cansee(td, p)) != 0) { PROC_UNLOCK(p); return (ret); } /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) { PROC_UNLOCK(p); return (0); } if (p->p_args != NULL) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } if ((p->p_flag & P_SYSTEM) != 0) { PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); ret = proc_getargv(td, p, sb); return (ret); } /* * Filler function for proc/pid/environ */ static int linprocfs_doprocenviron(PFS_FILL_ARGS) { /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) return (0); return (proc_getenvv(td, p, sb)); } static char l32_map_str[] = "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char l64_map_str[] = "%016lx-%016lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char vdso_str[] = " [vdso]"; static char stack_str[] = " [stack]"; /* * Filler function for proc/pid/maps */ static int linprocfs_doprocmaps(PFS_FILL_ARGS) { struct vmspace *vm; vm_map_t map; vm_map_entry_t entry, tmp_entry; vm_object_t obj, tobj, lobj; vm_offset_t e_start, e_end; vm_ooffset_t off = 0; vm_prot_t e_prot; unsigned int last_timestamp; char *name = "", *freename = NULL; const char *l_map_str; ino_t ino; int ref_count, shadow_count, flags; int error; struct vnode *vp; struct vattr vat; PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); error = 0; vm = vmspace_acquire_ref(p); if (vm == NULL) return (ESRCH); if (SV_CURPROC_FLAG(SV_LP64)) l_map_str = l64_map_str; else l_map_str = l32_map_str; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { name = ""; freename = NULL; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; e_prot = entry->protection; e_start = entry->start; e_end = entry->end; obj = entry->object.vm_object; for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } last_timestamp = map->timestamp; vm_map_unlock_read(map); ino = 0; if (lobj) { off = IDX_TO_OFF(lobj->size); vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(td, vp, &name, &freename); vn_lock(vp, LK_SHARED | LK_RETRY); VOP_GETATTR(vp, &vat, td->td_ucred); ino = vat.va_fileid; vput(vp); } else if (SV_PROC_ABI(p) == SV_ABI_LINUX) { if (e_start == p->p_sysent->sv_shared_page_base) name = vdso_str; if (e_end == p->p_sysent->sv_usrstack) name = stack_str; } } else { flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, access, offset, major, minor, inode, name. */ error = sbuf_printf(sb, l_map_str, (u_long)e_start, (u_long)e_end, (e_prot & VM_PROT_READ)?"r":"-", (e_prot & VM_PROT_WRITE)?"w":"-", (e_prot & VM_PROT_EXECUTE)?"x":"-", "p", (u_long)off, 0, 0, (u_long)ino, *name ? " " : "", name ); if (freename) free(freename, M_TEMP); vm_map_lock_read(map); if (error == -1) { error = 0; break; } if (last_timestamp != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, e_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) static int linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen) { struct ifnet *ifscan; int ethno; IFNET_RLOCK_ASSERT(); /* Short-circuit non ethernet interfaces */ if (!IFP_IS_ETH(ifp)) return (strlcpy(buffer, ifp->if_xname, buflen)); /* Determine the (relative) unit number for ethernet interfaces */ ethno = 0; TAILQ_FOREACH(ifscan, &V_ifnet, if_link) { if (ifscan == ifp) return (snprintf(buffer, buflen, "eth%d", ethno)); if (IFP_IS_ETH(ifscan)) ethno++; } return (0); } /* * Filler function for proc/net/dev */ static int linprocfs_donetdev(PFS_FILL_ARGS) { char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; sbuf_printf(sb, "%6s|%58s|%s\n" "%6s|%58s|%58s\n", "Inter-", " Receive", " Transmit", " face", "bytes packets errs drop fifo frame compressed multicast", "bytes packets errs drop fifo colls carrier compressed"); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s: ", ifname); sbuf_printf(sb, "%7ju %7ju %4ju %4ju %4lu %5lu %10lu %9ju ", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS), /* rx_missed_errors */ 0UL, /* rx_fifo_errors */ 0UL, /* rx_length_errors + * rx_over_errors + * rx_crc_errors + * rx_frame_errors */ 0UL, /* rx_compressed */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS)); /* XXX-BZ rx only? */ sbuf_printf(sb, "%8ju %7ju %4ju %4ju %4lu %5ju %7lu %10lu\n", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS), 0UL, /* tx_fifo_errors */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS), 0UL, /* tx_carrier_errors + * tx_aborted_errors + * tx_window_errors + * tx_heartbeat_errors*/ 0UL); /* tx_compressed */ } IFNET_RUNLOCK(); CURVNET_RESTORE(); return (0); } /* * Filler function for proc/sys/kernel/osrelease */ static int linprocfs_doosrelease(PFS_FILL_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s\n", osrelease); return (0); } /* * Filler function for proc/sys/kernel/ostype */ static int linprocfs_doostype(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); sbuf_printf(sb, "%s\n", osname); return (0); } /* * Filler function for proc/sys/kernel/version */ static int linprocfs_doosbuild(PFS_FILL_ARGS) { linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/sys/kernel/msgmni */ static int linprocfs_domsgmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmni); return (0); } /* * Filler function for proc/sys/kernel/pid_max */ static int linprocfs_dopid_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%i\n", PID_MAX); return (0); } /* * Filler function for proc/sys/kernel/sem */ static int linprocfs_dosem(PFS_FILL_ARGS) { sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns, seminfo.semopm, seminfo.semmni); return (0); } /* * Filler function for proc/scsi/device_info */ static int linprocfs_doscsidevinfo(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/scsi/scsi */ static int linprocfs_doscsiscsi(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/devices */ static int linprocfs_dodevices(PFS_FILL_ARGS) { char *char_devices; sbuf_printf(sb, "Character devices:\n"); char_devices = linux_get_char_devices(); sbuf_printf(sb, "%s", char_devices); linux_free_get_char_devices(char_devices); sbuf_printf(sb, "\nBlock devices:\n"); return (0); } /* * Filler function for proc/cmdline */ static int linprocfs_docmdline(PFS_FILL_ARGS) { sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname); sbuf_printf(sb, " ro root=302\n"); return (0); } /* * Filler function for proc/filesystems */ static int linprocfs_dofilesystems(PFS_FILL_ARGS) { struct vfsconf *vfsp; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_flags & VFCF_SYNTHETIC) sbuf_printf(sb, "nodev"); sbuf_printf(sb, "\t%s\n", vfsp->vfc_name); } vfsconf_sunlock(); return(0); } #if 0 /* * Filler function for proc/modules */ static int linprocfs_domodules(PFS_FILL_ARGS) { struct linker_file *lf; TAILQ_FOREACH(lf, &linker_files, link) { sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename, (unsigned long)lf->size, lf->refs); } return (0); } #endif /* * Filler function for proc/pid/fd */ static int linprocfs_dofdescfs(PFS_FILL_ARGS) { if (p == curproc) sbuf_printf(sb, "/dev/fd"); else sbuf_printf(sb, "unknown"); return (0); } /* * Filler function for proc/pid/limits */ static const struct linux_rlimit_ident { const char *desc; const char *unit; unsigned int rlim_id; } linux_rlimits_ident[] = { { "Max cpu time", "seconds", RLIMIT_CPU }, { "Max file size", "bytes", RLIMIT_FSIZE }, { "Max data size", "bytes", RLIMIT_DATA }, { "Max stack size", "bytes", RLIMIT_STACK }, { "Max core file size", "bytes", RLIMIT_CORE }, { "Max resident set", "bytes", RLIMIT_RSS }, { "Max processes", "processes", RLIMIT_NPROC }, { "Max open files", "files", RLIMIT_NOFILE }, { "Max locked memory", "bytes", RLIMIT_MEMLOCK }, { "Max address space", "bytes", RLIMIT_AS }, { "Max file locks", "locks", LINUX_RLIMIT_LOCKS }, { "Max pending signals", "signals", LINUX_RLIMIT_SIGPENDING }, { "Max msgqueue size", "bytes", LINUX_RLIMIT_MSGQUEUE }, { "Max nice priority", "", LINUX_RLIMIT_NICE }, { "Max realtime priority", "", LINUX_RLIMIT_RTPRIO }, { "Max realtime timeout", "us", LINUX_RLIMIT_RTTIME }, { 0, 0, 0 } }; static int linprocfs_doproclimits(PFS_FILL_ARGS) { const struct linux_rlimit_ident *li; struct plimit *limp; struct rlimit rl; ssize_t size; int res, error; error = 0; PROC_LOCK(p); limp = lim_hold(p->p_limit); PROC_UNLOCK(p); size = sizeof(res); sbuf_printf(sb, "%-26s%-21s%-21s%-21s\n", "Limit", "Soft Limit", "Hard Limit", "Units"); for (li = linux_rlimits_ident; li->desc != NULL; ++li) { switch (li->rlim_id) { case LINUX_RLIMIT_LOCKS: /* FALLTHROUGH */ case LINUX_RLIMIT_RTTIME: rl.rlim_cur = RLIM_INFINITY; break; case LINUX_RLIMIT_SIGPENDING: error = kernel_sysctlbyname(td, "kern.sigqueue.max_pending_per_proc", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_MSGQUEUE: error = kernel_sysctlbyname(td, "kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_NICE: /* FALLTHROUGH */ case LINUX_RLIMIT_RTPRIO: rl.rlim_cur = 0; rl.rlim_max = 0; break; default: rl = limp->pl_rlimit[li->rlim_id]; break; } if (rl.rlim_cur == RLIM_INFINITY) sbuf_printf(sb, "%-26s%-21s%-21s%-10s\n", li->desc, "unlimited", "unlimited", li->unit); else sbuf_printf(sb, "%-26s%-21llu%-21llu%-10s\n", li->desc, (unsigned long long)rl.rlim_cur, (unsigned long long)rl.rlim_max, li->unit); } out: lim_free(limp); return (error); } /* * Filler function for proc/sys/kernel/random/uuid */ static int linprocfs_douuid(PFS_FILL_ARGS) { struct uuid uuid; kern_uuidgen(&uuid, 1); sbuf_printf_uuid(sb, &uuid); sbuf_printf(sb, "\n"); return(0); } /* * Filler function for proc/pid/auxv */ static int linprocfs_doauxv(PFS_FILL_ARGS) { struct sbuf *asb; off_t buflen, resid; int error; /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwise. */ if (p->p_vmspace == &vmspace0) return (0); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0 || uio->uio_resid < 0) return (EINVAL); asb = sbuf_new_auto(); if (asb == NULL) return (ENOMEM); error = proc_getauxv(td, p, asb); if (error == 0) error = sbuf_finish(asb); resid = sbuf_len(asb) - uio->uio_offset; if (resid > uio->uio_resid) buflen = uio->uio_resid; else buflen = resid; if (buflen > IOSIZE_MAX) return (EINVAL); if (buflen > MAXPHYS) buflen = MAXPHYS; if (resid <= 0) return (0); if (error == 0) error = uiomove(sbuf_data(asb) + uio->uio_offset, buflen, uio); sbuf_delete(asb); return (error); } /* * Constructor */ static int linprocfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; root = pi->pi_root; /* /proc/... */ pfs_create_file(root, "cmdline", &linprocfs_docmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "devices", &linprocfs_dodevices, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "filesystems", &linprocfs_dofilesystems, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "loadavg", &linprocfs_doloadavg, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "meminfo", &linprocfs_domeminfo, NULL, NULL, NULL, PFS_RD); #if 0 pfs_create_file(root, "modules", &linprocfs_domodules, NULL, NULL, NULL, PFS_RD); #endif pfs_create_file(root, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mtab", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "partitions", &linprocfs_dopartitions, NULL, NULL, NULL, PFS_RD); pfs_create_link(root, "self", &procfs_docurproc, NULL, NULL, NULL, 0); pfs_create_file(root, "stat", &linprocfs_dostat, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "swaps", &linprocfs_doswaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "uptime", &linprocfs_douptime, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "version", &linprocfs_doversion, NULL, NULL, NULL, PFS_RD); /* /proc/net/... */ dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0); pfs_create_file(dir, "dev", &linprocfs_donetdev, NULL, NULL, NULL, PFS_RD); /* /proc//... */ dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "cwd", &linprocfs_doproccwd, NULL, NULL, NULL, 0); pfs_create_file(dir, "environ", &linprocfs_doprocenviron, NULL, &procfs_candebug, NULL, PFS_RD); pfs_create_link(dir, "exe", &procfs_doprocfile, NULL, &procfs_notsystem, NULL, 0); pfs_create_file(dir, "maps", &linprocfs_doprocmaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "mem", &procfs_doprocmem, &procfs_attr, &procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_file(dir, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "root", &linprocfs_doprocroot, NULL, NULL, NULL, 0); pfs_create_file(dir, "stat", &linprocfs_doprocstat, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "statm", &linprocfs_doprocstatm, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", &linprocfs_doprocstatus, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "fd", &linprocfs_dofdescfs, NULL, NULL, NULL, 0); pfs_create_file(dir, "auxv", &linprocfs_doauxv, NULL, &procfs_candebug, NULL, PFS_RD|PFS_RAWRD); pfs_create_file(dir, "limits", &linprocfs_doproclimits, NULL, NULL, NULL, PFS_RD); /* /proc/scsi/... */ dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0); pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi, NULL, NULL, NULL, PFS_RD); /* /proc/sys/... */ dir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0); /* /proc/sys/kernel/... */ dir = pfs_create_dir(dir, "kernel", NULL, NULL, NULL, 0); pfs_create_file(dir, "osrelease", &linprocfs_doosrelease, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ostype", &linprocfs_doostype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "version", &linprocfs_doosbuild, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmni", &linprocfs_domsgmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "pid_max", &linprocfs_dopid_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "sem", &linprocfs_dosem, NULL, NULL, NULL, PFS_RD); /* /proc/sys/kernel/random/... */ dir = pfs_create_dir(dir, "random", NULL, NULL, NULL, 0); pfs_create_file(dir, "uuid", &linprocfs_douuid, NULL, NULL, NULL, PFS_RD); return (0); } /* * Destructor */ static int linprocfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(linprocfs, 1, PR_ALLOW_MOUNT_LINPROCFS); #if defined(__amd64__) MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1); #else MODULE_DEPEND(linprocfs, linux, 1, 1, 1); #endif MODULE_DEPEND(linprocfs, procfs, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1); Index: head/sys/fs/fuse/fuse_vnops.c =================================================================== --- head/sys/fs/fuse/fuse_vnops.c (revision 317060) +++ head/sys/fs/fuse/fuse_vnops.c (revision 317061) @@ -1,1977 +1,1977 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_param.h" #include "fuse_io.h" #include #define FUSE_DEBUG_MODULE VNOPS #include "fuse_debug.h" /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; struct vop_vector fuse_vnops = { .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_link = fuse_vnop_link, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = vop_stdpathconf, .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, }; static u_long fuse_lookup_cache_hits = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, &fuse_lookup_cache_hits, 0, ""); static u_long fuse_lookup_cache_misses = 0; SYSCTL_ULONG(_vfs_fuse, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, &fuse_lookup_cache_misses, 0, ""); int fuse_lookup_cache_enable = 1; SYSCTL_INT(_vfs_fuse, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, &fuse_lookup_cache_enable, 0, ""); /* * XXX: This feature is highly experimental and can bring to instabilities, * needs revisiting before to be enabled by default. */ static int fuse_reclaim_revoked = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, reclaim_revoked, CTLFLAG_RW, &fuse_reclaim_revoked, 0, ""); int fuse_pbuf_freecnt = -1; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN, 0) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } bzero(&facp, sizeof(facp)); err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); FS_DEBUG2G("err=%d accmode=0x%x\n", err, accmode); return err; } /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; fufh_type_t fufh_type; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if (vnode_isdir(vp)) { if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return 0; } if (fflag & IO_NDELAY) { return 0; } fufh_type = fuse_filehandle_xlate_from_fflags(fflag); if (!fuse_filehandle_valid(vp, fufh_type)) { int i; for (i = 0; i < FUFH_MAXTYPE; i++) if (fuse_filehandle_valid(vp, i)) break; if (i == FUFH_MAXTYPE) panic("FUSE: fufh type %d found to be invalid in close" " (fflag=0x%x)\n", fufh_type, fflag); } if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred); } return 0; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_open_in *foi; struct fuse_entry_out *feo; struct fuse_dispatcher fdi; struct fuse_dispatcher *fdip = &fdi; int err; struct mount *mp = vnode_mount(dvp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); uint64_t x_fh_id; uint32_t x_open_flags; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } bzero(&fdi, sizeof(fdi)); /* XXX: Will we ever want devices ? */ if ((vap->va_type != VREG)) { printf("fuse_vnop_create: unsupported va_type %d\n", vap->va_type); return (EINVAL); } debug_printf("parent nid = %ju, mode = %x\n", (uintmax_t)parentnid, mode); fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); if (!fsess_isimpl(mp, FUSE_CREATE)) { debug_printf("eh, daemon doesn't implement create?\n"); return (EINVAL); } fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); foi = fdip->indata; foi->mode = mode; foi->flags = O_CREAT | O_RDWR; memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_CREATE); debug_printf("create: got err=%d from daemon\n", err); goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, VREG))) { goto out; } err = fuse_vnode_get(mp, feo->nodeid, dvp, vpp, cnp, VREG); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = OFLAGS(mode); fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick); return err; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fdip->answ = feo + 1; x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); fuse_vnode_open(*vpp, x_open_flags, td); cache_purge_negative(dvp); out: fdisp_destroy(fdip); return err; } /* * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux * version of FUSE also has a FUSE_FLUSH method. * * On Linux, fsync() synchronizes a file's complete in-core state with that * on disk. The call is not supposed to return until the system has completed * that action or until an error is detected. * * Linux also has an fdatasync() call that is similar to fsync() but is not * required to update the metadata such as access time and modification time. */ /* struct vnop_fsync_args { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); int type, err = 0; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; if (!fsess_isimpl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { goto out; } for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { fuse_internal_fsync(vp, td, NULL, fufh); } } out: return 0; } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); int err = 0; int dataflags; struct fuse_dispatcher fdi; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; debug_printf("fuse_getattr b: returning ENOTCONN\n"); return err; } else { goto fake; } } fdisp_init(&fdi, 0); if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { if ((err == ENOTCONN) && vnode_isvroot(vp)) { /* see comment at similar place in fuse_statfs() */ fdisp_destroy(&fdi); goto fake; } if (err == ENOENT) { fuse_internal_vnode_disappear(vp); } goto out; } cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); if (vap != VTOVA(vp)) { memcpy(vap, VTOVA(vp), sizeof(*vap)); } if (vap->va_type != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; goto out; } if ((fvdat->flag & FN_SIZECHANGE) != 0) vap->va_size = fvdat->filesize; if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { /* * This is for those cases when the file size changed without us * knowing, and we want to catch up. */ off_t new_filesize = ((struct fuse_attr_out *) fdi.answ)->attr.size; if (fvdat->filesize != new_filesize) { fuse_vnode_setsize(vp, cred, new_filesize); } } debug_printf("fuse_getattr e: returning 0\n"); out: fdisp_destroy(&fdi); return err; fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type, need_flush = 1; FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL); } if (fuse_data_cache_invalidate || (fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, type, td, NULL); } } if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { vrecycle(vp); } return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } if (vap->va_nlink >= FUSE_LINK_MAX) { return EMLINK; } fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); out: fdisp_destroy(&fdi); return err; } /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; enum fuse_opcode op; uint64_t nid; struct fuse_access_param facp; FS_DEBUG2G("parent_inode=%ju - %*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) { return ENOTDIR; } if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { return EROFS; } /* * We do access check prior to doing anything else only in the case * when we are at fs root (we'd like to say, "we are at the first * component", but that's not exactly the same... nevermind). * See further comments at further access checks. */ bzero(&facp, sizeof(facp)); if (vnode_isvroot(dvp)) { /* early permission check hack */ if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { return err; } } if (flags & ISDOTDOT) { nid = VTOFUD(dvp)->parent_nid; if (nid == 0) { return ENOENT; } fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (fuse_lookup_cache_enable) { err = cache_lookup(dvp, vpp, cnp, NULL, NULL); switch (err) { case -1: /* positive match */ atomic_add_acq_long(&fuse_lookup_cache_hits, 1); return 0; case 0: /* no match in cache */ atomic_add_acq_long(&fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ /* fall through */ default: return err; } } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); op = FUSE_LOOKUP; calldaemon: fdisp_make(&fdi, op, mp, nid, td, cred); if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; } lookup_err = fdisp_wait_answ(&fdi); if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; if (!nid) { /* * zero nodeid is the same as "not found", * but it's also cacheable (which we keep * keep on doing not as of writing this) */ lookup_err = ENOENT; } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { fdisp_destroy(&fdi); return lookup_err; } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { if ((nameiop == CREATE || nameiop == RENAME) && islastcn /* && directory dvp has not been removed */ ) { if (vfs_isrdonly(mp)) { err = EROFS; goto out; } #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Possibly record the position of a slot in the * directory large enough for the new component name. * This can be recorded in the vnode private data for * dvp. Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; goto out; } /* Consider inserting name into cache. */ /* * No we can't use negative caching, as the fs * changes are out of our control. * False positives' falseness turns out just as things * go by, but false negatives' falseness doesn't. * (and aiding the caching mechanism with extra control * mechanisms comes quite close to beating the whole purpose * caching...) */ #if 0 if ((cnp->cn_flags & MAKEENTRY) != 0) { FS_DEBUG("inserting NULL into cache\n"); cache_enter(dvp, NULL, cnp); } #endif err = ENOENT; goto out; } else { /* !lookup_err */ struct fuse_entry_out *feo = NULL; struct fuse_attr *fattr = NULL; if (op == FUSE_GETATTR) { fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { feo = (struct fuse_entry_out *)fdi.answ; fattr = &(feo->attr); } /* * If deleting, and at end of pathname, return parameters * which can be used to remove file. If the wantparent flag * isn't set, we return only the directory, otherwise we go on * and lock the inode, being careful with ".". */ if (nameiop == DELETE && islastcn) { /* * Check for write access on directory. */ facp.xuid = fattr->uid; facp.facc_flags |= FACCESS_STICKY; err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); facp.facc_flags &= ~FACCESS_XQUERIES; if (err) { goto out; } if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(dvp->v_mount, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) goto out; *vpp = vp; } /* * Save the name for use in VOP_RMDIR and VOP_REMOVE * later. */ cnp->cn_flags |= SAVENAME; goto out; } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && wantparent && islastcn) { #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Check for "." */ if (nid == VTOI(dvp)) { err = EISDIR; goto out; } err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } *vpp = vp; /* * Save the name for use in VOP_RENAME later. */ cnp->cn_flags |= SAVENAME; goto out; } if (flags & ISDOTDOT) { struct mount *mp; int ltype; /* * Expanded copy of vn_vget_ino() so that * fuse_vnode_get() can be used. */ mp = dvp->v_mount; ltype = VOP_ISLOCKED(dvp); err = vfs_busy(mp, MBF_NOWAIT); if (err != 0) { vfs_ref(mp); VOP_UNLOCK(dvp, 0); err = vfs_busy(mp, 0); vn_lock(dvp, ltype | LK_RETRY); vfs_rel(mp); if (err) goto out; if ((dvp->v_iflag & VI_DOOMED) != 0) { err = ENOENT; vfs_unbusy(mp); goto out; } } VOP_UNLOCK(dvp, 0); err = fuse_vnode_get(vnode_mount(dvp), nid, NULL, &vp, cnp, IFTOVT(fattr->mode)); vfs_unbusy(mp); vn_lock(dvp, ltype | LK_RETRY); if ((dvp->v_iflag & VI_DOOMED) != 0) { if (err == 0) vput(vp); err = ENOENT; } if (err) goto out; *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(vnode_mount(dvp), nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } fuse_vnode_setparent(vp, dvp); *vpp = vp; } if (op == FUSE_GETATTR) { cache_attrs(*vpp, (struct fuse_attr_out *)fdi.answ); } else { cache_attrs(*vpp, (struct fuse_entry_out *)fdi.answ); } /* Insert name into cache if appropriate. */ /* * Nooo, caching is evil. With caching, we can't avoid stale * information taking over the playground (cached info is not * just positive/negative, it does have qualitative aspects, * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when * walking down along cached path components, and that's not * any cheaper than FUSE_LOOKUP. This might change with * implementing kernel side attr caching, but... In Linux, * lookup results are not cached, and the daemon is bombarded * with FUSE_LOOKUPS on and on. This shows that by design, the * daemon is expected to handle frequent lookup queries * efficiently, do its caching in userspace, and so on. * * So just leave the name cache alone. */ /* * Well, now I know, Linux caches lookups, but with a * timeout... So it's the same thing as attribute caching: * we can deal with it when implement timeouts. */ #if 0 if (cnp->cn_flags & MAKEENTRY) { cache_enter(dvp, *vpp, cnp); } #endif } out: if (!lookup_err) { /* No lookup error; need to clean up. */ if (err) { /* Found inode; exit with no vnode. */ if (op == FUSE_LOOKUP) { fuse_internal_forget_send(vnode_mount(dvp), td, cred, nid, 1); } fdisp_destroy(&fdi); return err; } else { #ifndef NO_EARLY_PERM_CHECK_HACK if (!islastcn) { /* * We have the attributes of the next item * *now*, and it's a fact, and we do not * have to do extra work for it (ie, beg the * daemon), and it neither depends on such * accidental things like attr caching. So * the big idea: check credentials *now*, * not at the beginning of the next call to * lookup. * * The first item of the lookup chain (fs root) * won't be checked then here, of course, as * its never "the next". But go and see that * the root is taken care about at the very * beginning of this function. * * Now, given we want to do the access check * this way, one might ask: so then why not * do the access check just after fetching * the inode and its attributes from the * daemon? Why bother with producing the * corresponding vnode at all if something * is not OK? We know what's the deal as * soon as we get those attrs... There is * one bit of info though not given us by * the daemon: whether his response is * authoritative or not... His response should * be ignored if something is mounted over * the dir in question. But that can be * known only by having the vnode... */ int tmpvtype = vnode_vtype(*vpp); bzero(&facp, sizeof(facp)); /*the early perm check hack */ facp.facc_flags |= FACCESS_VA_VALID; if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { err = ENOTDIR; } if (!err && !vnode_mountedhere(*vpp)) { err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); } if (err) { if (tmpvtype == VLNK) FS_DEBUG("weird, permission error with a symlink?\n"); vput(*vpp); *vpp = NULL; } } #endif } } fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; fuse_trace_printf_vnop(); if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { return (EINVAL); } /* struct vnop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; fufh_type_t fufh_type; struct fuse_vnode_data *fvdat; int error, isdir = 0; int32_t fuse_open_flags; FS_DEBUG2G("inode=%ju mode=0x%x\n", (uintmax_t)VTOI(vp), mode); if (fuse_isdeadfs(vp)) { return ENXIO; } fvdat = VTOFUD(vp); if (vnode_isdir(vp)) { isdir = 1; } fuse_open_flags = 0; if (isdir) { fufh_type = FUFH_RDONLY; } else { fufh_type = fuse_filehandle_xlate_from_fflags(mode); /* * For WRONLY opens, force DIRECT_IO. This is necessary * since writing a partial block through the buffer cache * will result in a read of the block and that read won't * be allowed by the WRONLY open. */ if (fufh_type == FUFH_WRONLY || (fvdat->flag & FN_DIRECTIO) != 0) fuse_open_flags = FOPEN_DIRECT_IO; } if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) { fuse_vnode_open(vp, fuse_open_flags, td); return 0; } error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); return error; } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; FS_DEBUG2G("inode=%ju offset=%jd resid=%zd\n", (uintmax_t)VTOI(vp), uio->uio_offset, uio->uio_resid); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct fuse_vnode_data *fvdat; struct fuse_iov cookediov; int err = 0; int freefufh = 0; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } fvdat = VTOFUD(vp); if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { FS_DEBUG("calling readdir() before open()"); err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); freefufh = 1; } else { err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); } if (err) { return (err); } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, fufh, &cookediov); fiov_teardown(&cookediov); if (freefufh) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } FS_DEBUG("inode=%ju\n", (uintmax_t)VTOI(vp)); for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", type); fuse_filehandle_close(vp, type, td, NULL); } } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } fuse_vnode_setparent(vp, NULL); cache_purge(vp); vfs_hash_remove(vp); vnode_destroy_vobject(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(vp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; int err = 0; FS_DEBUG2G("from: inode=%ju name=%*s -> to: inode=%ju name=%*s\n", (uintmax_t)VTOI(fvp), (int)fcnp->cn_namelen, fcnp->cn_nameptr, (uintmax_t)(tvp == NULL ? -1 : VTOI(tvp)), (int)tcnp->cn_namelen, tcnp->cn_nameptr); if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { FS_DEBUG("cross-device rename: %s -> %s\n", fcnp->cn_nameptr, (tcnp != NULL ? tcnp->cn_nameptr : "(NULL)")); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ data = fuse_get_mpdata(vnode_mount(tdvp)); sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if ((tvp != NULL) && vnode_isdir(tvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; struct fuse_access_param facp; int err = 0; enum vtype vtyp; int sizechanged = 0; uint64_t newsize = 0; FS_DEBUG2G("inode=%ju\n", (uintmax_t)VTOI(vp)); if (fuse_isdeadfs(vp)) { return ENXIO; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; bzero(&facp, sizeof(facp)); facp.xuid = vap->va_uid; facp.xgid = vap->va_gid; if (vap->va_uid != (uid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->uid = vap->va_uid; fsai->valid |= FATTR_UID; } if (vap->va_gid != (gid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->gid = vap->va_gid; fsai->valid |= FATTR_GID; } if (vap->va_size != VNOVAL) { struct fuse_filehandle *fufh = NULL; /*Truncate to a new value. */ fsai->size = vap->va_size; sizechanged = 1; newsize = vap->va_size; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } } if (vap->va_atime.tv_sec != VNOVAL) { fsai->atime = vap->va_atime.tv_sec; fsai->atimensec = vap->va_atime.tv_nsec; fsai->valid |= FATTR_ATIME; } if (vap->va_mtime.tv_sec != VNOVAL) { fsai->mtime = vap->va_mtime.tv_sec; fsai->mtimensec = vap->va_mtime.tv_nsec; fsai->valid |= FATTR_MTIME; } if (vap->va_mode != (mode_t)VNOVAL) { fsai->mode = vap->va_mode & ALLPERMS; fsai->valid |= FATTR_MODE; } if (!fsai->valid) { goto out; } vtyp = vnode_vtype(vp); if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { err = EISDIR; goto out; } if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { err = EROFS; goto out; } if (fsai->valid & ~FATTR_SIZE) { /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ /*XXX */ err = 0; } facp.facc_flags &= ~FACCESS_XQUERIES; if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && vap->va_vaflags & VA_UTIMES_NULL) { err = fuse_internal_access(vp, VWRITE, &facp, td, cred); } if (err) goto out; if ((err = fdisp_wait_answ(&fdi))) goto out; vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); if (vnode_vtype(vp) != vtyp) { if (vnode_vtype(vp) == VNON && vtyp != VNON) { debug_printf("FUSE: Dang! vnode_vtype is VNON and vtype isn't.\n"); } else { /* * STALE vnode, ditch * * The vnode has changed its type "behind our back". There's * nothing really we can do, so let us just force an internal * revocation and tell the caller to try again, if interested. */ fuse_internal_vnode_disappear(vp); err = EAGAIN; } } if (!err && !sizechanged) { cache_attrs(vp, (struct fuse_attr_out *)fdi.answ); } out: fdisp_destroy(&fdi); if (!err && sizechanged) { fuse_vnode_setsize(vp, cred, newsize); VTOFUD(vp)->flag &= ~FN_SIZECHANGE; } return err; } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; fuse_trace_printf_vnop(); if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return ENXIO; } if (bp->b_iocmd == BIO_WRITE) fuse_vnode_refreshsize(vp, NOCRED); (void)fuse_io_strategy(vp, bp); /* * This is a dangerous function. If returns error, that might mean a * panic. We prefer pretty much anything over being forced to panic * by a malicious daemon (a demon?). So we just return 0 anyway. You * should never mind this: this function has its own error * propagation mechanism via the argument buffer, so * not-that-melodramatic residents of the call chain still will be * able to know what to do. */ return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; FS_DEBUG2G("inode=%ju name=%*s\n", (uintmax_t)VTOI(dvp), (int)cnp->cn_namelen, cnp->cn_nameptr); if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; fuse_trace_printf_vnop(); if (fuse_isdeadfs(vp)) { return ENXIO; } fuse_vnode_refreshsize(vp, cred); if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to getpages")); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; npages = ap->a_count; if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } /* * If the last page is partially valid, just return it and allow * the pager to zero-out the blanks. Partially valid pages can * only occur at the file EOF. * * XXXGL: is that true for FUSE, which is a local filesystem, * but still somewhat disconnected from the kernel? */ VM_OBJECT_WLOCK(vp->v_object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(vp->v_object); /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, npages); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (error && (uio.uio_resid == count)) { FS_DEBUG("error %d\n", error); return VM_PAGER_ERROR; } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(vp->v_object); fuse_vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error occurred * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } } fuse_vm_page_unlock_queues(); out: VM_OBJECT_WUNLOCK(vp->v_object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); } /* struct vnop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; }; */ static int fuse_vnop_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; vm_ooffset_t fsize; FS_DEBUG2G("heh\n"); vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to putpages")); fsize = vp->v_object->un_pager.vnp.vnp_size; td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); if (!fsess_opt_mmap(vnode_mount(vp))) { FS_DEBUG("called on non-cacheable vnode??\n"); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; /* * When putting pages, do not extend file past EOF. */ if (offset + count > fsize) { count = fsize - offset; if (count < 0) count = 0; } /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = getpbuf(&fuse_pbuf_freecnt); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, count); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); relpbuf(bp, &fuse_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; VM_OBJECT_WLOCK(pages[i]->object); vm_page_undirty(pages[i]); VM_OBJECT_WUNLOCK(pages[i]->object); } } return rtvals[0]; } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } Index: head/sys/fs/nfsclient/nfs_clbio.c =================================================================== --- head/sys/fs/nfsclient/nfs_clbio.c (revision 317060) +++ head/sys/fs/nfsclient/nfs_clbio.c (revision 317061) @@ -1,1896 +1,1896 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int newnfs_directio_allow_mmap; extern struct nfsstatsv1 nfsstatsv1; extern struct mtx ncl_iod_mutex; extern int ncl_numasync; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern int newnfs_directio_enable; extern int nfs_keep_dirty_on_error; int ncl_pbuf_freecnt = -1; /* start out unlimited */ static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td); static int nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag); /* * Vnode op for VM getpages. */ SYSCTL_DECL(_vfs_nfs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Use buffer pager instead of direct readrpc call"); static daddr_t ncl_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (off / vp->v_bufobj.bo_bsize); } static int ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn) { struct nfsnode *np; u_quad_t nsize; int biosize, bcount; np = VTONFS(vp); mtx_lock(&np->n_mtx); nsize = np->n_size; mtx_unlock(&np->n_mtx); biosize = vp->v_bufobj.bo_bsize; bcount = biosize; if ((off_t)lbn * biosize >= nsize) bcount = 0; else if ((off_t)(lbn + 1) * biosize > nsize) bcount = nsize - (off_t)lbn * biosize; return (bcount); } int ncl_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; vm_object_t object; vm_page_t *pages; struct nfsnode *np; vp = ap->a_vp; np = VTONFS(vp); td = curthread; cred = curthread->td_ucred; nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; npages = ap->a_count; if ((object = vp->v_object) == NULL) { printf("ncl_getpages: called with non-merged cache vnode\n"); return (VM_PAGER_ERROR); } if (newnfs_directio_enable && !newnfs_directio_allow_mmap) { mtx_lock(&np->n_mtx); if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); printf("ncl_getpages: called on non-cacheable vnode\n"); return (VM_PAGER_ERROR); } else mtx_unlock(&np->n_mtx); } mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); /* We'll never get here for v4, because we always have fsinfo */ (void)ncl_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); if (use_buf_pager) return (vfs_bio_getpages(vp, pages, npages, ap->a_rbehind, ap->a_rahead, ncl_gbp_getblkno, ncl_gbp_getblksz)); /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. * * XXXGL: is that true for NFS, where short read can occur??? */ VM_OBJECT_WLOCK(object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(object); /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = getpbuf(&ncl_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, npages); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = ncl_readrpc(vp, &uio, cred); pmap_qremove(kva, npages); relpbuf(bp, &ncl_pbuf_freecnt); if (error && (uio.uio_resid == count)) { printf("ncl_getpages: error %d\n", error); return (VM_PAGER_ERROR); } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(object); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("nfs_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error * occurred we may have hit a zero-fill * section. We leave valid set to 0, and page * is freed by vm_page_readahead_finish() if * its index is not equal to requested, or * page is zeroed and set valid by * vm_pager_get_pages() for requested page. */ ; } } out: VM_OBJECT_WUNLOCK(object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); } /* * Vnode op for VM putpages. */ int ncl_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; np = VTONFS(vp); td = curthread; /* XXX */ /* Set the cred to n_writecred for the write rpcs. */ if (np->n_writecred != NULL) cred = crhold(np->n_writecred); else cred = crhold(curthread->td_ucred); /* XXX */ nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); mtx_lock(&np->n_mtx); if (newnfs_directio_enable && !newnfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); printf("ncl_putpages: called on noncache-able vnode\n"); mtx_lock(&np->n_mtx); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_ERROR; /* * When putting pages, do not extend file past EOF. */ if (offset + count > np->n_size) { count = np->n_size - offset; if (count < 0) count = 0; } mtx_unlock(&np->n_mtx); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, count); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = unmapped_buf; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_NOCOPY; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync), cred); crfree(cred); if (error == 0 || !nfs_keep_dirty_on_error) vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid); return (rtvals[0]); } /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. * For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, * you may have lost data cache consistency with the * server, so flush all of the file's data out of the cache. * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to * NFS_ATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ static inline int nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) { int error = 0; struct vattr vattr; struct nfsnode *np = VTONFS(vp); int old_lock; /* * Grab the exclusive lock before checking whether the cache is * consistent. * XXX - We can make this cheaper later (by acquiring cheaper locks). * But for now, this suffices. */ old_lock = ncl_upgrade_vnlock(vp); if (vp->v_iflag & VI_DOOMED) { error = EBADF; goto out; } mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); ncl_invaldir(vp); error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = EBADF; if (error != 0) goto out; } np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; mtx_unlock(&np->n_mtx); } else { mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, cred); if (error) return (error); mtx_lock(&np->n_mtx); if ((np->n_flag & NSIZECHANGED) || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { mtx_unlock(&np->n_mtx); if (vp->v_type == VDIR) ncl_invaldir(vp); error = ncl_vinvalbuf(vp, V_SAVE, td, 1); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = EBADF; if (error != 0) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; np->n_flag &= ~NSIZECHANGED; } mtx_unlock(&np->n_mtx); } out: ncl_downgrade_vnlock(vp, old_lock); return (error); } /* * Vnode op for read using bio */ int ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) { struct nfsnode *np = VTONFS(vp); int biosize, i; struct buf *bp, *rabp; struct thread *td; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; int bcount; int seqcount; int nra, error = 0, n = 0, on = 0; off_t tmp_off; KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode")); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ return (EINVAL); td = uio->uio_td; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); mtx_lock(&nmp->nm_mtx); } if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0) (void) newnfs_iosize(nmp); tmp_off = uio->uio_offset + uio->uio_resid; if (vp->v_type != VDIR && (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) { mtx_unlock(&nmp->nm_mtx); return (EFBIG); } mtx_unlock(&nmp->nm_mtx); if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) /* No caching/ no readaheads. Just read data into the user buffer */ return ncl_readrpc(vp, uio, cred); biosize = vp->v_bufobj.bo_bsize; seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); error = nfs_bioread_check_cons(vp, td, cred); if (error) return error; do { u_quad_t nsize; mtx_lock(&np->n_mtx); nsize = np->n_size; mtx_unlock(&np->n_mtx); switch (vp->v_type) { case VREG: NFSINCRGLOBAL(nfsstatsv1.biocache_reads); lbn = uio->uio_offset / biosize; on = uio->uio_offset - (lbn * biosize); /* * Start the read ahead(s), as required. */ if (nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { rabn = lbn + 1 + nra; if (incore(&vp->v_bufobj, rabn) == NULL) { rabp = nfs_getcacheblk(vp, rabn, biosize, td); if (!rabp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (ncl_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); break; } } else { brelse(rabp); } } } } /* Note that bcount is *not* DEV_BSIZE aligned. */ bcount = biosize; if ((off_t)lbn * biosize >= nsize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > nsize) { bcount = nsize - (off_t)lbn * biosize; } bp = nfs_getcacheblk(vp, lbn, bcount, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. */ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); return (error); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = MIN((unsigned)(bcount - on), uio->uio_resid); break; case VLNK: NFSINCRGLOBAL(nfsstatsv1.biocache_readlinks); bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); return (error); } } n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); on = 0; break; case VDIR: NFSINCRGLOBAL(nfsstatsv1.biocache_readdirs); if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { return (0); } lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { ncl_invaldir(vp); error = ncl_vinvalbuf(vp, 0, td, 1); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) return (EBADF); /* * Yuck! The directory has been modified on the * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. * * Leave the last bp intact unless there is an error. * Loop back up to the while if the error is another * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) return (0); bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); if (!bp) { error = newnfs_sigintr(nmp, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); /* * no error + B_INVAL == directory EOF, * use the block. */ if (error == 0 && (bp->b_flags & B_INVAL)) break; } /* * An error will throw away the block and the * for loop will break out. If no error and this * is not the block we want, we throw away the * block and go for the next one via the for loop. */ if (error || i < lbn) brelse(bp); } } /* * The above while is repeated if we hit another cookie * error. If we hit an error and it wasn't a cookie error, * we give up. */ if (error) return (error); } /* * If not eof and read aheads are enabled, start one. * (You need the current block first, so that you have the * directory offset cookie of the next block.) */ if (nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && incore(&vp->v_bufobj, lbn + 1) == NULL) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (ncl_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); } } else { brelse(rabp); } } } /* * Unlike VREG files, whos buffer size ( bp->b_bcount ) is * chopped for the EOF condition, we cannot tell how large * NFS directories are going to be until we hit EOF. So * an NFS directory buffer is *not* chopped to its EOF. Now, * it just so happens that b_resid will effectively chop it * to EOF. *BUT* this information is lost if the buffer goes * away and is reconstituted into a B_CACHE state ( due to * being VMIO ) later. So we keep track of the directory eof * in np->n_direofoffset and chop it off as an extra step * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; break; default: printf(" ncl_bioread: type %x unexpected\n", vp->v_type); bp = NULL; break; } if (n > 0) { error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio); } if (vp->v_type == VLNK) n = 0; if (bp != NULL) brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); return (error); } /* * The NFS write path cannot handle iovecs with len > 1. So we need to * break up iovecs accordingly (restricting them to wsize). * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). * For the ASYNC case, 2 copies are needed. The first a copy from the * user buffer to a staging buffer and then a second copy from the staging * buffer to mbufs. This can be optimized by copying from the user buffer * directly into mbufs and passing the chain down, but that requires a * fair amount of re-working of the relevant codepaths (and can be done * later). */ static int nfs_directio_write(vp, uiop, cred, ioflag) struct vnode *vp; struct uio *uiop; struct ucred *cred; int ioflag; { int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct thread *td = uiop->uio_td; int size; int wsize; mtx_lock(&nmp->nm_mtx); wsize = nmp->nm_wsize; mtx_unlock(&nmp->nm_mtx); if (ioflag & IO_SYNC) { int iomode, must_commit; struct uio uio; struct iovec iov; do_sync: while (uiop->uio_resid > 0) { size = MIN(uiop->uio_resid, wsize); size = MIN(uiop->uio_iov->iov_len, size); iov.iov_base = uiop->uio_iov->iov_base; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = uiop->uio_offset; uio.uio_resid = size; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; iomode = NFSWRITE_FILESYNC; error = ncl_writerpc(vp, &uio, cred, &iomode, &must_commit, 0); KASSERT((must_commit == 0), ("ncl_directio_write: Did not commit write")); if (error) return (error); uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } else { struct uio *t_uio; struct iovec *t_iov; struct buf *bp; /* * Break up the write into blocksize chunks and hand these * over to nfsiod's for write back. * Unfortunately, this incurs a copy of the data. Since * the user could modify the buffer before the write is * initiated. * * The obvious optimization here is that one of the 2 copies * in the async write path can be eliminated by copying the * data here directly into mbufs and passing the mbuf chain * down. But that will require a fair amount of re-working * of the code and can be done if there's enough interest * in NFS directio access. */ while (uiop->uio_resid > 0) { size = MIN(uiop->uio_resid, wsize); size = MIN(uiop->uio_iov->iov_len, size); bp = getpbuf(&ncl_pbuf_freecnt); t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); t_iov->iov_len = size; t_uio->uio_iov = t_iov; t_uio->uio_iovcnt = 1; t_uio->uio_offset = uiop->uio_offset; t_uio->uio_resid = size; t_uio->uio_segflg = UIO_SYSSPACE; t_uio->uio_rw = UIO_WRITE; t_uio->uio_td = td; KASSERT(uiop->uio_segflg == UIO_USERSPACE || uiop->uio_segflg == UIO_SYSSPACE, ("nfs_directio_write: Bad uio_segflg")); if (uiop->uio_segflg == UIO_USERSPACE) { error = copyin(uiop->uio_iov->iov_base, t_iov->iov_base, size); if (error != 0) goto err_free; } else /* * UIO_SYSSPACE may never happen, but handle * it just in case it does. */ bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); bp->b_flags |= B_DIRECT; bp->b_iocmd = BIO_WRITE; if (cred != NOCRED) { crhold(cred); bp->b_wcred = cred; } else bp->b_wcred = NOCRED; bp->b_caller1 = (void *)t_uio; bp->b_vp = vp; error = ncl_asyncio(nmp, bp, NOCRED, td); err_free: if (error) { free(t_iov->iov_base, M_NFSDIRECTIO); free(t_iov, M_NFSDIRECTIO); free(t_uio, M_NFSDIRECTIO); bp->b_vp = NULL; relpbuf(bp, &ncl_pbuf_freecnt); if (error == EINTR) return (error); goto do_sync; } uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } return (0); } /* * Vnode op for write using bio */ int ncl_write(struct vop_write_args *ap) { int biosize; struct uio *uio = ap->a_uio; struct thread *td = uio->uio_td; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount, noncontig_write, obcount; int bp_cached, n, on, error = 0, error1, wouldcommit; size_t orig_resid, local_resid; off_t orig_size, tmp_off; KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("ncl_write proc")); if (vp->v_type != VREG) return (EIO); mtx_lock(&np->n_mtx); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; mtx_unlock(&np->n_mtx); return (np->n_error); } else mtx_unlock(&np->n_mtx); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)ncl_fsinfo(nmp, vp, cred, td); mtx_lock(&nmp->nm_mtx); } if (nmp->nm_wsize == 0) (void) newnfs_iosize(nmp); mtx_unlock(&nmp->nm_mtx); /* * Synchronously flush pending buffers if we are in synchronous * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ /* * Require non-blocking, synchronous writes to * dirty files to inform the program it needs * to fsync(2) explicitly. */ if (ioflag & IO_NDELAY) return (EAGAIN); #endif np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag & IO_VMIO) != 0 ? V_VMIO : 0), td, 1); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = EBADF; if (error != 0) return (error); } else mtx_unlock(&np->n_mtx); } orig_resid = uio->uio_resid; mtx_lock(&np->n_mtx); orig_size = np->n_size; mtx_unlock(&np->n_mtx); /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. */ if (ioflag & IO_APPEND) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) return (error); mtx_lock(&np->n_mtx); uio->uio_offset = np->n_size; mtx_unlock(&np->n_mtx); } if (uio->uio_offset < 0) return (EINVAL); tmp_off = uio->uio_offset + uio->uio_resid; if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset) return (EFBIG); if (uio->uio_resid == 0) return (0); if (newnfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) return nfs_directio_write(vp, uio, cred, ioflag); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters */ if (vn_rlimit_fsize(vp, uio, td)) return (EFBIG); biosize = vp->v_bufobj.bo_bsize; /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return error. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ wouldcommit = 0; if (!(ioflag & IO_SYNC)) { int nflag; mtx_lock(&np->n_mtx); nflag = np->n_flag; mtx_unlock(&np->n_mtx); if (nflag & NMODIFIED) { BO_LOCK(&vp->v_bufobj); if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { if (bp->b_flags & B_NEEDCOMMIT) wouldcommit += bp->b_bcount; } } BO_UNLOCK(&vp->v_bufobj); } } do { if (!(ioflag & IO_SYNC)) { wouldcommit += biosize; if (wouldcommit > nmp->nm_wcommitsize) { np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag & IO_VMIO) != 0 ? V_VMIO : 0), td, 1); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = EBADF; if (error != 0) return (error); wouldcommit = biosize; } } NFSINCRGLOBAL(nfsstatsv1.biocache_writes); lbn = uio->uio_offset / biosize; on = uio->uio_offset - (lbn * biosize); n = MIN((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ mtx_lock(&np->n_mtx); if ((np->n_flag & NHASBEENLOCKED) == 0 && (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0) noncontig_write = 1; else noncontig_write = 0; if ((uio->uio_offset == np->n_size || (noncontig_write != 0 && lbn == (np->n_size / biosize) && uio->uio_offset + n > np->n_size)) && n) { mtx_unlock(&np->n_mtx); /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ obcount = np->n_size - (lbn * biosize); bp = nfs_getcacheblk(vp, lbn, obcount, td); if (bp != NULL) { long save; mtx_lock(&np->n_mtx); np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); mtx_unlock(&np->n_mtx); save = bp->b_flags & B_CACHE; bcount = on + n; allocbuf(bp, bcount); bp->b_flags |= save; if (noncontig_write != 0 && on > obcount) vfs_bio_bzero_buf(bp, obcount, on - obcount); } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. */ bcount = on + n; if ((off_t)lbn * biosize + bcount < np->n_size) { if ((off_t)(lbn + 1) * biosize < np->n_size) bcount = biosize; else bcount = np->n_size - (off_t)lbn * biosize; } mtx_unlock(&np->n_mtx); bp = nfs_getcacheblk(vp, lbn, bcount, td); mtx_lock(&np->n_mtx); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } mtx_unlock(&np->n_mtx); } if (!bp) { error = newnfs_sigintr(nmp, td); if (!error) error = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ bp_cached = 1; if (on == 0 && n == bcount) { if ((bp->b_flags & B_CACHE) == 0) bp_cached = 0; bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = ncl_doio(vp, bp, cred, td, 0); if (error) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); mtx_lock(&np->n_mtx); np->n_flag |= NMODIFIED; mtx_unlock(&np->n_mtx); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { printf("NFS append race @%lx:%d\n", (long)bp->b_blkno * DEV_BSIZE, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * If there has been a file lock applied to this file * or vfs.nfs.old_noncontig_writing is set, do the following: * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * If vfs.nfs.old_noncontig_writing is not set and there has * not been file locking done on this file: * Relax coherency a bit for the sake of performance and * expand the current dirty region to contain the new * write even if it means we mark some non-dirty data as * dirty. */ if (noncontig_write == 0 && bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { if (bwrite(bp) == EINTR) { error = EINTR; break; } goto again; } local_resid = uio->uio_resid; error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio); if (error != 0 && !bp_cached) { /* * This block has no other content then what * possibly was written by the faulty uiomove. * Release it, forgetting the data pages, to * prevent the leak of uninitialized data to * usermode. */ bp->b_ioflags |= BIO_ERROR; brelse(bp); uio->uio_offset -= local_resid - uio->uio_resid; uio->uio_resid = local_resid; break; } /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); /* * Get the partial update on the progress made from * uiomove, if an error occurred. */ if (error != 0) n = local_resid - uio->uio_resid; /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n > 0) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } /* * If IO_SYNC do bwrite(). * * IO_INVAL appears to be unused. The idea appears to be * to turn off caching in this case. Very odd. XXX */ if ((ioflag & IO_SYNC)) { if (ioflag & IO_INVAL) bp->b_flags |= B_NOCACHE; error1 = bwrite(bp); if (error1 != 0) { if (error == 0) error = error1; break; } } else if ((n + on) == biosize || (ioflag & IO_ASYNC) != 0) { bp->b_flags |= B_ASYNC; (void) ncl_writebp(bp, 0, NULL); } else { bdwrite(bp); } if (error != 0) break; } while (uio->uio_resid > 0 && n > 0); if (error != 0) { if (ioflag & IO_UNIT) { VATTR_NULL(&vattr); vattr.va_size = orig_size; /* IO_SYNC is handled implicitely */ (void)VOP_SETATTR(vp, &vattr, cred); uio->uio_offset -= orig_resid - uio->uio_resid; uio->uio_resid = orig_resid; } } return (error); } /* * Get an nfs cache block. * * Allocate a new one if the block isn't currently in the cache * and return the block marked busy. If the calling process is * interrupted by a signal for an interruptible mount point, return * NULL. * * The caller must carefully deal with the possible B_INVAL state of * the buffer. ncl_doio() clears B_INVAL (and ncl_asyncio() clears it * indirectly), so synchronous reads can be issued without worrying about * the B_INVAL state. We have to be a little more careful when dealing * with writes (see comments in nfs_write()) when extending a file past * its EOF. */ static struct buf * nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td) { struct buf *bp; struct mount *mp; struct nfsmount *nmp; mp = vp->v_mount; nmp = VFSTONFS(mp); if (nmp->nm_flag & NFSMNT_INT) { sigset_t oldset; newnfs_set_sigmask(td, &oldset); bp = getblk(vp, bn, size, PCATCH, 0, 0); newnfs_restore_sigmask(td, &oldset); while (bp == NULL) { if (newnfs_sigintr(nmp, td)) return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz, 0); } } else { bp = getblk(vp, bn, size, 0, 0, 0); } if (vp->v_type == VREG) bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE); return (bp); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) { struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, slpflag, slptimeo; int old_lock = 0; ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf"); if ((nmp->nm_flag & NFSMNT_INT) == 0) intrflg = 0; if ((nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)) intrflg = 1; if (intrflg) { slpflag = PCATCH; slptimeo = 2 * hz; } else { slpflag = 0; slptimeo = 0; } old_lock = ncl_upgrade_vnlock(vp); if (vp->v_iflag & VI_DOOMED) { /* * Since vgonel() uses the generic vinvalbuf() to flush * dirty buffers and it does not call this function, it * is safe to just return OK when VI_DOOMED is set. */ ncl_downgrade_vnlock(vp, old_lock); return (0); } /* * Now, flush as required. */ if ((flags & (V_SAVE | V_VMIO)) == V_SAVE && vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); /* * If the page clean was interrupted, fail the invalidation. * Not doing so, we run the risk of losing dirty pages in the * vinvalbuf() call below. */ if (intrflg && (error = newnfs_sigintr(nmp, td))) goto out; } error = vinvalbuf(vp, flags, slpflag, 0); while (error) { if (intrflg && (error = newnfs_sigintr(nmp, td))) goto out; error = vinvalbuf(vp, flags, 0, slptimeo); } if (NFSHASPNFS(nmp)) { nfscl_layoutcommit(vp, td); /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ mtx_lock(&np->n_mtx); np->n_attrstamp = 0; } else mtx_lock(&np->n_mtx); if (np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; mtx_unlock(&np->n_mtx); out: ncl_downgrade_vnlock(vp, old_lock); return error; } /* * Initiate asynchronous I/O. Return an error if no nfsiods are available. * This is mainly to avoid queueing async I/O requests when the nfsiods * are all hung on a dead server. * * Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp * is eventually dequeued by the async daemon, ncl_doio() *will*. */ int ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td) { int iod; int gotiod; int slpflag = 0; int slptimeo = 0; int error, error2; /* * Commits are usually short and sweet so lets save some cpu and * leave the async daemons for more important rpc's (such as reads * and writes). * * Readdirplus RPCs do vget()s to acquire the vnodes for entries * in the directory in order to update attributes. This can deadlock * with another thread that is waiting for async I/O to be done by * an nfsiod thread while holding a lock on one of these vnodes. * To avoid this deadlock, don't allow the async nfsiod threads to * perform Readdirplus RPCs. */ mtx_lock(&ncl_iod_mutex); if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && (nmp->nm_bufqiods > ncl_numasync / 2)) || (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) { mtx_unlock(&ncl_iod_mutex); return(EIO); } again: if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; /* * Find a free iod to process this request. */ for (iod = 0; iod < ncl_numasync; iod++) if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) { gotiod = TRUE; break; } /* * Try to create one if none are free. */ if (!gotiod) ncl_nfsiodnew(); else { /* * Found one, so wake it up and tell it which * mount to process. */ NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n", iod, nmp)); ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE; ncl_iodmount[iod] = nmp; nmp->nm_bufqiods++; wakeup(&ncl_iodwant[iod]); } /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. */ if (!gotiod) { if (nmp->nm_bufqiods > 0) { NFS_DPF(ASYNCIO, ("ncl_asyncio: %d iods are already processing mount %p\n", nmp->nm_bufqiods, nmp)); gotiod = TRUE; } } /* * If we have an iod which can process the request, then queue * the buffer. */ if (gotiod) { /* * Ensure that the queue never grows too large. We still want * to asynchronize so we block rather then return EIO. */ while (nmp->nm_bufqlen >= 2*ncl_numasync) { NFS_DPF(ASYNCIO, ("ncl_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = newnfs_msleep(td, &nmp->nm_bufq, &ncl_iod_mutex, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { error2 = newnfs_sigintr(nmp, td); if (error2) { mtx_unlock(&ncl_iod_mutex); return (error2); } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* * We might have lost our iod while sleeping, * so check and loop if necessary. */ goto again; } /* We might have lost our nfsiod */ if (nmp->nm_bufqiods == 0) { NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); goto again; } if (bp->b_iocmd == BIO_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); } else { if (bp->b_wcred == NOCRED && cred != NOCRED) bp->b_wcred = crhold(cred); } if (bp->b_flags & B_REMFREE) bremfreef(bp); BUF_KERNPROC(bp); TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { mtx_lock(&(VTONFS(bp->b_vp))->n_mtx); VTONFS(bp->b_vp)->n_flag |= NMODIFIED; VTONFS(bp->b_vp)->n_directio_asyncwr++; mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx); } mtx_unlock(&ncl_iod_mutex); return (0); } mtx_unlock(&ncl_iod_mutex); /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. */ NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n")); return (EIO); } void ncl_doio_directwrite(struct buf *bp) { int iomode, must_commit; struct uio *uiop = (struct uio *)bp->b_caller1; char *iov_base = uiop->uio_iov->iov_base; iomode = NFSWRITE_FILESYNC; uiop->uio_td = NULL; /* NULL since we're in nfsiod */ ncl_writerpc(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit, 0); KASSERT((must_commit == 0), ("ncl_doio_directwrite: Did not commit write")); free(iov_base, M_NFSDIRECTIO); free(uiop->uio_iov, M_NFSDIRECTIO); free(uiop, M_NFSDIRECTIO); if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { struct nfsnode *np = VTONFS(bp->b_vp); mtx_lock(&np->n_mtx); if (NFSHASPNFS(VFSTONFS(vnode_mount(bp->b_vp)))) { /* * Invalidate the attribute cache, since writes to a DS * won't update the size attribute. */ np->n_attrstamp = 0; } np->n_directio_asyncwr--; if (np->n_directio_asyncwr == 0) { np->n_flag &= ~NMODIFIED; if ((np->n_flag & NFSYNCWAIT)) { np->n_flag &= ~NFSYNCWAIT; wakeup((caddr_t)&np->n_directio_asyncwr); } } mtx_unlock(&np->n_mtx); } bp->b_vp = NULL; relpbuf(bp, &ncl_pbuf_freecnt); } /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. */ int ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td, int called_from_strategy) { struct uio *uiop; struct nfsnode *np; struct nfsmount *nmp; int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; struct proc *p = td ? td->td_proc : NULL; uint8_t iocmd; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp)); iocmd = bp->b_iocmd; if (iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; NFSINCRGLOBAL(nfsstatsv1.read_bios); error = ncl_readrpc(vp, uiop, cr); if (!error) { if (uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; ssize_t left = uiop->uio_resid; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } } /* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */ if (p && (vp->v_vflag & VV_TEXT)) { mtx_lock(&np->n_mtx); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) { mtx_unlock(&np->n_mtx); PROC_LOCK(p); killproc(p, "text file modification"); PROC_UNLOCK(p); } else mtx_unlock(&np->n_mtx); } break; case VLNK: uiop->uio_offset = (off_t)0; NFSINCRGLOBAL(nfsstatsv1.readlink_bios); error = ncl_readlinkrpc(vp, uiop, cr); break; case VDIR: NFSINCRGLOBAL(nfsstatsv1.readdir_bios); uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { error = ncl_readdirplusrpc(vp, uiop, cr, td); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = ncl_readdirrpc(vp, uiop, cr, td); /* * end-of-directory sets B_INVAL but does not generate an * error. */ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; default: printf("ncl_doio: type %x unexpected\n", vp->v_type); break; } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { int retv; off_t off; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, td); if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; bufdone(bp); return (0); } if (retv == NFSERR_STALEWRITEVERF) { ncl_clearcommit(vp->v_mount); } } /* * Setup for actual write */ mtx_lock(&np->n_mtx); if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; mtx_unlock(&np->n_mtx); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; NFSINCRGLOBAL(nfsstatsv1.write_bios); if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSWRITE_UNSTABLE; else iomode = NFSWRITE_FILESYNC; error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit, called_from_strategy); /* * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole * cluster. We can do this even if the buffer is not 100% * dirty (relative to the NFS blocksize), so we optimize the * append-to-file-case. * * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be * cleared because write clustering only works for commit * rpc's, not for the data portion of the write). */ if (!error && iomode == NFSWRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. * * EIO is returned by ncl_writerpc() to indicate a recoverable * write error and is handled as above, except that * B_EINTR isn't set. One cause of this is a stale stateid * error for the RPC that indicates recovery is required, * when called with called_from_strategy != 0. * * If the buffer is marked B_PAGING, it does not reside on * the vp's paging queues so we cannot call bdirty(). The * bp in this case is not an NFS cache block so we should * be safe. XXX * * The logic below breaks up errors into recoverable and * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE * and keep the buffer around for potential write retries. * For the latter (eg ESTALE), we toss the buffer away (B_INVAL) * and save the error in the nfsnode. This is less than ideal * but necessary. Keeping such buffers around could potentially * cause buffer exhaustion eventually (they can never be written * out, so will get constantly be re-dirtied). It also causes * all sorts of vfs panics. For non-recoverable write errors, * also invalidate the attrcache, so we'll be forced to go over * the wire for this object, returning an error to user on next * call (most of the time). */ if (error == EINTR || error == EIO || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = np->n_error = error; mtx_lock(&np->n_mtx); np->n_flag |= NWRITEERR; np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); mtx_unlock(&np->n_mtx); } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; if (must_commit) ncl_clearcommit(vp->v_mount); bufdone(bp); return (error); } /* * Used to aid in handling ftruncate() operations on the NFS client side. * Truncation creates a number of special problems for NFS. We have to * throw away VM pages and buffer cache buffers that are beyond EOF, and * we have to properly handle VM pages or (potentially dirty) buffers * that straddle the truncation point. */ int ncl_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) { struct nfsnode *np = VTONFS(vp); u_quad_t tsize; int biosize = vp->v_bufobj.bo_bsize; int error = 0; mtx_lock(&np->n_mtx); tsize = np->n_size; np->n_size = nsize; mtx_unlock(&np->n_mtx); if (nsize < tsize) { struct buf *bp; daddr_t lbn; int bufsize; /* * vtruncbuf() doesn't get the buffer overlapping the * truncation point. We may have a B_DELWRI and/or B_CACHE * buffer that now needs to be truncated. */ error = vtruncbuf(vp, cred, nsize, biosize); lbn = nsize / biosize; bufsize = nsize - (lbn * biosize); bp = nfs_getcacheblk(vp, lbn, bufsize, td); if (!bp) return EINTR; if (bp->b_dirtyoff > bp->b_bcount) bp->b_dirtyoff = bp->b_bcount; if (bp->b_dirtyend > bp->b_bcount) bp->b_dirtyend = bp->b_bcount; bp->b_flags |= B_RELBUF; /* don't leave garbage around */ brelse(bp); } else { vnode_pager_setsize(vp, nsize); } return(error); } Index: head/sys/fs/smbfs/smbfs_io.c =================================================================== --- head/sys/fs/smbfs/smbfs_io.c (revision 317060) +++ head/sys/fs/smbfs/smbfs_io.c (revision 317061) @@ -1,676 +1,676 @@ /*- * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* #include */ #include #include #include #include #include #include /*#define SMBFS_RWGENERIC*/ extern int smbfs_pbuf_freecnt; static int smbfs_fastlookup = 1; SYSCTL_DECL(_vfs_smbfs); SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, ""); #define DE_SIZE (sizeof(struct dirent)) static int smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) { struct dirent de; struct componentname cn; struct smb_cred *scred; struct smbfs_fctx *ctx; struct vnode *newvp; struct smbnode *np = VTOSMB(vp); int error/*, *eofflag = ap->a_eofflag*/; long offset, limit; np = VTOSMB(vp); SMBVDEBUG("dirname='%s'\n", np->n_name); scred = smbfs_malloc_scred(); smb_makescred(scred, uio->uio_td, cred); offset = uio->uio_offset / DE_SIZE; /* offset in the directory */ limit = uio->uio_resid / DE_SIZE; if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0) { error = EINVAL; goto out; } while (limit && offset < 2) { limit--; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = (offset == 0) ? np->n_ino : (np->n_parent ? np->n_parentino : 2); if (de.d_fileno == 0) de.d_fileno = 0x7ffffffd + offset; de.d_namlen = offset + 1; de.d_name[0] = '.'; de.d_name[1] = '.'; de.d_name[offset + 1] = '\0'; de.d_type = DT_DIR; error = uiomove(&de, DE_SIZE, uio); if (error) goto out; offset++; uio->uio_offset += DE_SIZE; } if (limit == 0) { error = 0; goto out; } if (offset != np->n_dirofs || np->n_dirseq == NULL) { SMBVDEBUG("Reopening search %ld:%ld\n", offset, np->n_dirofs); if (np->n_dirseq) { smbfs_findclose(np->n_dirseq, scred); np->n_dirseq = NULL; } np->n_dirofs = 2; error = smbfs_findopen(np, "*", 1, SMB_FA_SYSTEM | SMB_FA_HIDDEN | SMB_FA_DIR, scred, &ctx); if (error) { SMBVDEBUG("can not open search, error = %d", error); goto out; } np->n_dirseq = ctx; } else ctx = np->n_dirseq; while (np->n_dirofs < offset) { error = smbfs_findnext(ctx, offset - np->n_dirofs++, scred); if (error) { smbfs_findclose(np->n_dirseq, scred); np->n_dirseq = NULL; error = ENOENT ? 0 : error; goto out; } } error = 0; for (; limit; limit--, offset++) { error = smbfs_findnext(ctx, limit, scred); if (error) break; np->n_dirofs++; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = ctx->f_attr.fa_ino; de.d_type = (ctx->f_attr.fa_attr & SMB_FA_DIR) ? DT_DIR : DT_REG; de.d_namlen = ctx->f_nmlen; bcopy(ctx->f_name, de.d_name, de.d_namlen); de.d_name[de.d_namlen] = '\0'; if (smbfs_fastlookup) { error = smbfs_nget(vp->v_mount, vp, ctx->f_name, ctx->f_nmlen, &ctx->f_attr, &newvp); if (!error) { cn.cn_nameptr = de.d_name; cn.cn_namelen = de.d_namlen; cache_enter(vp, newvp, &cn); vput(newvp); } } error = uiomove(&de, DE_SIZE, uio); if (error) break; } if (error == ENOENT) error = 0; uio->uio_offset = offset * DE_SIZE; out: smbfs_free_scred(scred); return error; } int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct thread *td; struct vattr vattr; struct smb_cred *scred; int error, lks; /* * Protect against method which is not supported for now */ if (uiop->uio_segflg == UIO_NOCOPY) return EOPNOTSUPP; if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("vn types other than VREG or VDIR are unsupported !\n"); return EIO; } if (uiop->uio_resid == 0) return 0; if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return EFBIG;*/ td = uiop->uio_td; if (vp->v_type == VDIR) { lks = LK_EXCLUSIVE; /* lockstatus(vp->v_vnlock); */ if (lks == LK_SHARED) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = smbfs_readvdir(vp, uiop, cred); if (lks == LK_SHARED) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); return error; } /* biosize = SSTOCN(smp->sm_share)->sc_txmax;*/ if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); error = smb_read(smp->sm_share, np->n_fid, uiop, scred); smbfs_free_scred(scred); return (error); } int smbfs_writevnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag) { struct smbmount *smp = VTOSMBFS(vp); struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; struct thread *td; int error = 0; if (vp->v_type != VREG) { SMBERROR("vn types other than VREG unsupported !\n"); return EIO; } SMBVDEBUG("ofs=%jd,resid=%zd\n", (intmax_t)uiop->uio_offset, uiop->uio_resid); if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return (EFBIG);*/ td = uiop->uio_td; if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = smbfs_vinvalbuf(vp, td); if (error) return error; } if (ioflag & IO_APPEND) { #ifdef notyet /* * File size can be changed by another client */ smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred); if (error) return (error); #endif uiop->uio_offset = np->n_size; } } if (uiop->uio_resid == 0) return 0; if (vn_rlimit_fsize(vp, uiop, td)) return (EFBIG); scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, uiop, scred); smbfs_free_scred(scred); SMBVDEBUG("after: ofs=%jd,resid=%zd\n", (intmax_t)uiop->uio_offset, uiop->uio_resid); if (!error) { if (uiop->uio_offset > np->n_size) { np->n_size = uiop->uio_offset; vnode_pager_setsize(vp, np->n_size); } } return error; } /* * Do an I/O operation to/from a cache block. */ int smbfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct uio *uiop; struct iovec io; struct smb_cred *scred; int error = 0; uiop = malloc(sizeof(struct uio), M_SMBFSDATA, M_WAITOK); uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; scred = smbfs_malloc_scred(); smb_makescred(scred, td, cr); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; error = smb_read(smp->sm_share, np->n_fid, uiop, scred); if (error) break; if (uiop->uio_resid) { int left = uiop->uio_resid; int nread = bp->b_bcount - left; if (left > 0) bzero((char *)bp->b_data + nread, left); } break; default: printf("smbfs_doio: type %x unexpected\n",vp->v_type); break; } if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } else { /* write */ if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = smb_write(smp->sm_share, np->n_fid, uiop, scred); /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); free(uiop, M_SMBFSDATA); smbfs_free_scred(scred); return 0; } } bp->b_resid = uiop->uio_resid; bufdone(bp); free(uiop, M_SMBFSDATA); smbfs_free_scred(scred); return error; } /* * Vnode op for VM getpages. * Wish wish .... get rid from multiple IO routines */ int smbfs_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; } */ *ap; { #ifdef SMBFS_RWGENERIC return vop_stdgetpages(ap); #else int i, error, nextoff, size, toff, npages, count; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct smbmount *smp; struct smbnode *np; struct smb_cred *scred; vm_object_t object; vm_page_t *pages; vp = ap->a_vp; if ((object = vp->v_object) == NULL) { printf("smbfs_getpages: called with non-merged cache vnode??\n"); return VM_PAGER_ERROR; } td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; npages = ap->a_count; /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. * * XXXGL: is that true for SMB filesystem? */ VM_OBJECT_WLOCK(object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(object); scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, npages); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = smb_read(smp->sm_share, np->n_fid, &uio, scred); smbfs_free_scred(scred); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); if (error && (uio.uio_resid == count)) { printf("smbfs_getpages: error %d\n",error); return VM_PAGER_ERROR; } size = count - uio.uio_resid; VM_OBJECT_WLOCK(object); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("smbfs_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("smbfs_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error occurred * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } } out: VM_OBJECT_WUNLOCK(object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); #endif /* SMBFS_RWGENERIC */ } /* * Vnode op for VM putpages. * possible bug: all IO done in sync mode * Note that vop_close always invalidate pages before close, so it's * not necessary to open vnode. */ int smbfs_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { int error; struct vnode *vp = ap->a_vp; struct thread *td; struct ucred *cred; #ifdef SMBFS_RWGENERIC td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ VOP_OPEN(vp, FWRITE, cred, td, NULL); error = vop_stdputpages(ap); VOP_CLOSE(vp, FWRITE, cred, td); return error; #else struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, npages, count; int *rtvals; struct smbmount *smp; struct smbnode *np; struct smb_cred *scred; vm_page_t *pages; td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ /* VOP_OPEN(vp, FWRITE, cred, td, NULL);*/ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); for (i = 0; i < npages; i++) { rtvals[i] = VM_PAGER_ERROR; } bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, count); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; SMBVDEBUG("ofs=%jd,resid=%zd\n", (intmax_t)uio.uio_offset, uio.uio_resid); scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, &uio, scred); smbfs_free_scred(scred); /* VOP_CLOSE(vp, FWRITE, cred, td);*/ SMBVDEBUG("paged write done: %d\n", error); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); if (!error) vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid); return rtvals[0]; #endif /* SMBFS_RWGENERIC */ } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int smbfs_vinvalbuf(struct vnode *vp, struct thread *td) { struct smbnode *np = VTOSMB(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz); error = smb_td_intr(td); if (error == EINTR) return EINTR; } np->n_flag |= NFLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return (error); } Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 317060) +++ head/sys/i386/i386/trap.c (revision 317061) @@ -1,1124 +1,1124 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern void trap(struct trapframe *frame); extern void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif static int prot_fault_translation = 0; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; vm_offset_t eva; ksiginfo_t ksi; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP && frame->tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while holding * a spin lock. */ if (type != T_PAGEFLT && td->td_md.md_spinlock_count == 0) enable_intr(); } } eva = 0; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and conditionally * reenable interrupts. If we hold a spin lock, then * we must not reenable interrupts. This might be a * spurious page fault. */ eva = rcr2(); if (td->td_md.md_spinlock_count == 0) enable_intr(); } if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif user_trctrap_out: frame->tf_eflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (i == 0) goto user; break; } i = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto userout; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; uprintf("pid %d killed due to lack of floating point\n", p->p_pid); i = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = npxtrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); goto out; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) goto out; break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); goto out; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_eip == (int)doreti_iret) { frame->tf_eip = (int)doreti_iret_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_ds) { frame->tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_es) { frame->tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_fs) { frame->tf_eip = (int)doreti_popl_fs_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ kernel_trctrap: if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame->tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto out; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); goto out; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x esp 0x%08x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_esp, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return (-2); #endif if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a user-space address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; #if defined(PAE) || defined(PAE_TABLES) else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; #endif else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", #if defined(PAE) || defined(PAE_TABLES) pg_nx != 0 ? (code & PGEX_I ? " instruction" : " data") : #endif "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) { frame->tf_err = eva; /* smuggle fault address to ddb */ if (kdb_trap(type, 0, frame)) { frame->tf_err = code; /* restore error code */ return; } frame->tf_err = code; /* restore error code */ } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; caddr_t params; long tmp; int error; p = td->td_proc; frame = td->td_frame; params = (caddr_t)frame->tf_esp + sizeof(int); sa->code = frame->tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)sa->args, (u_int)(sa->narg * sizeof(int))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; struct syscall_args sa; register_t orig_tf_eflags; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; error = syscallenter(td, &sa); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); syscallret(td, error, &sa); } Index: head/sys/i386/include/atomic.h =================================================================== --- head/sys/i386/include/atomic.h (revision 317060) +++ head/sys/i386/include/atomic.h (revision 317061) @@ -1,854 +1,854 @@ /*- * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ATOMIC_H_ #define _MACHINE_ATOMIC_H_ #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif #ifdef _KERNEL #include #include #endif #ifndef __OFFSETOF_MONITORBUF /* * __OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf). * * The open-coded number is used instead of the symbolic expression to * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers. * An assertion in i386/vm_machdep.c ensures that the value is correct. */ -#define __OFFSETOF_MONITORBUF 0x180 +#define __OFFSETOF_MONITORBUF 0x80 static __inline void __mbk(void) { __asm __volatile("lock; addl $0,%%fs:%0" : "+m" (*(u_int *)__OFFSETOF_MONITORBUF) : : "memory", "cc"); } static __inline void __mbu(void) { __asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc"); } #endif /* * Various simple operations on memory, each of which is atomic in the * presence of interrupts and multiple processors. * * atomic_set_char(P, V) (*(u_char *)(P) |= (V)) * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V)) * atomic_add_char(P, V) (*(u_char *)(P) += (V)) * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V)) * * atomic_set_short(P, V) (*(u_short *)(P) |= (V)) * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V)) * atomic_add_short(P, V) (*(u_short *)(P) += (V)) * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V)) * * atomic_set_int(P, V) (*(u_int *)(P) |= (V)) * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V)) * atomic_add_int(P, V) (*(u_int *)(P) += (V)) * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V)) * atomic_swap_int(P, V) (return (*(u_int *)(P)); *(u_int *)(P) = (V);) * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;) * * atomic_set_long(P, V) (*(u_long *)(P) |= (V)) * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V)) * atomic_add_long(P, V) (*(u_long *)(P) += (V)) * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V)) * atomic_swap_long(P, V) (return (*(u_long *)(P)); *(u_long *)(P) = (V);) * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;) */ /* * The above functions are expanded inline in the statically-linked * kernel. Lock prefixes are generated if an SMP kernel is being * built. * * Kernel modules call real functions which are built into the kernel. * This allows kernel modules to be portable between UP and SMP systems. */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_char(volatile u_char *dst, u_char expect, u_char src); int atomic_cmpset_short(volatile u_short *dst, u_short expect, u_short src); int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src); int atomic_fcmpset_char(volatile u_char *dst, u_char *expect, u_char src); int atomic_fcmpset_short(volatile u_short *dst, u_short *expect, u_short src); int atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src); u_int atomic_fetchadd_int(volatile u_int *p, u_int v); int atomic_testandset_int(volatile u_int *p, u_int v); int atomic_testandclear_int(volatile u_int *p, u_int v); void atomic_thread_fence_acq(void); void atomic_thread_fence_acq_rel(void); void atomic_thread_fence_rel(void); void atomic_thread_fence_seq_cst(void); #define ATOMIC_LOAD(TYPE) \ u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p) #define ATOMIC_STORE(TYPE) \ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_64(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64(volatile uint64_t *); void atomic_store_rel_64(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); #else /* !KLD_MODULE && __GNUCLIKE_ASM */ /* * For userland, always use lock prefixes so that the binaries will run * on both SMP and !SMP systems. */ #if defined(SMP) || !defined(_KERNEL) #define MPLOCKED "lock ; " #else #define MPLOCKED #endif /* * The assembly is volatilized to avoid code chunk removal by the compiler. * GCC aggressively reorders operations and memory clobbering is necessary * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "cc"); \ } \ \ static __inline void \ atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "memory", "cc"); \ } \ struct __hack /* * Atomic compare and set, used by the mutex functions. * * cmpset: * if (*dst == expect) * *dst = src * * fcmpset: * if (*dst == *expect) * *dst = src * else * *expect = *dst * * Returns 0 on failure, non-zero on success. */ #define ATOMIC_CMPSET(TYPE, CONS) \ static __inline int \ atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_cmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (expect) /* 2 */ \ : CONS (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } \ \ static __inline int \ atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \ { \ u_char res; \ \ __asm __volatile( \ " " MPLOCKED " " \ " cmpxchg %3,%1 ; " \ " sete %0 ; " \ "# atomic_fcmpset_" #TYPE " " \ : "=q" (res), /* 0 */ \ "+m" (*dst), /* 1 */ \ "+a" (*expect) /* 2 */ \ : CONS (src) /* 3 */ \ : "memory", "cc"); \ return (res); \ } ATOMIC_CMPSET(char, "q"); ATOMIC_CMPSET(short, "r"); ATOMIC_CMPSET(int, "r"); /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. */ static __inline u_int atomic_fetchadd_int(volatile u_int *p, u_int v) { __asm __volatile( " " MPLOCKED " " " xaddl %0,%1 ; " "# atomic_fetchadd_int" : "+r" (v), /* 0 */ "+m" (*p) /* 1 */ : : "cc"); return (v); } static __inline int atomic_testandset_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btsl %2,%1 ; " " setc %0 ; " "# atomic_testandset_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandclear_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btrl %2,%1 ; " " setc %0 ; " "# atomic_testandclear_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } /* * We assume that a = b will do atomic loads and stores. Due to the * IA32 memory model, a simple store guarantees release semantics. * * However, a load may pass a store if they are performed on distinct * addresses, so we need Store/Load barrier for sequentially * consistent fences in SMP kernels. We use "lock addl $0,mem" for a * Store/Load barrier, as recommended by the AMD Software Optimization * Guide, and not mfence. In the kernel, we use a private per-cpu * cache line for "mem", to avoid introducing false data * dependencies. In user space, we use the word at the top of the * stack. * * For UP kernels, however, the memory of the single processor is * always consistent, so we only need to stop the compiler from * reordering accesses in a way that violates the semantics of acquire * and release. */ #if defined(_KERNEL) #if defined(SMP) #define __storeload_barrier() __mbk() #else /* _KERNEL && UP */ #define __storeload_barrier() __compiler_membar() #endif /* SMP */ #else /* !_KERNEL */ #define __storeload_barrier() __mbu() #endif /* _KERNEL*/ #define ATOMIC_LOAD(TYPE) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ u_##TYPE res; \ \ res = *p; \ __compiler_membar(); \ return (res); \ } \ struct __hack #define ATOMIC_STORE(TYPE) \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \ { \ \ __compiler_membar(); \ *p = v; \ } \ struct __hack static __inline void atomic_thread_fence_acq(void) { __compiler_membar(); } static __inline void atomic_thread_fence_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_acq_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_seq_cst(void) { __storeload_barrier(); } #ifdef _KERNEL #ifdef WANT_FUNCTIONS int atomic_cmpset_64_i386(volatile uint64_t *, uint64_t, uint64_t); int atomic_cmpset_64_i586(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64_i386(volatile uint64_t *); uint64_t atomic_load_acq_64_i586(volatile uint64_t *); void atomic_store_rel_64_i386(volatile uint64_t *, uint64_t); void atomic_store_rel_64_i586(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i386(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i586(volatile uint64_t *, uint64_t); #endif /* I486 does not support SMP or CMPXCHG8B. */ static __inline int atomic_cmpset_64_i386(volatile uint64_t *dst, uint64_t expect, uint64_t src) { volatile uint32_t *p; u_char res; p = (volatile uint32_t *)dst; __asm __volatile( " pushfl ; " " cli ; " " xorl %1,%%eax ; " " xorl %2,%%edx ; " " orl %%edx,%%eax ; " " jne 1f ; " " movl %4,%1 ; " " movl %5,%2 ; " "1: " " sete %3 ; " " popfl" : "+A" (expect), /* 0 */ "+m" (*p), /* 1 */ "+m" (*(p + 1)), /* 2 */ "=q" (res) /* 3 */ : "r" ((uint32_t)src), /* 4 */ "r" ((uint32_t)(src >> 32)) /* 5 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i386(volatile uint64_t *p) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " popfl" : "=&A" (res) /* 0 */ : "m" (*q), /* 1 */ "m" (*(q + 1)) /* 2 */ : "memory"); return (res); } static __inline void atomic_store_rel_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %%eax,%0 ; " " movl %%edx,%1 ; " " popfl" : "=m" (*q), /* 0 */ "=m" (*(q + 1)) /* 1 */ : "A" (v) /* 2 */ : "memory"); } static __inline uint64_t atomic_swap_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " movl %4,%2 ; " " movl %3,%1 ; " " popfl" : "=&A" (res), /* 0 */ "+m" (*q), /* 1 */ "+m" (*(q + 1)) /* 2 */ : "r" ((uint32_t)v), /* 3 */ "r" ((uint32_t)(v >> 32))); /* 4 */ return (res); } static __inline int atomic_cmpset_64_i586(volatile uint64_t *dst, uint64_t expect, uint64_t src) { u_char res; __asm __volatile( " " MPLOCKED " " " cmpxchg8b %1 ; " " sete %0" : "=q" (res), /* 0 */ "+m" (*dst), /* 1 */ "+A" (expect) /* 2 */ : "b" ((uint32_t)src), /* 3 */ "c" ((uint32_t)(src >> 32)) /* 4 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i586(volatile uint64_t *p) { uint64_t res; __asm __volatile( " movl %%ebx,%%eax ; " " movl %%ecx,%%edx ; " " " MPLOCKED " " " cmpxchg8b %1" : "=&A" (res), /* 0 */ "+m" (*p) /* 1 */ : : "memory", "cc"); return (res); } static __inline void atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); } static __inline uint64_t atomic_swap_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); return (v); } static __inline int atomic_cmpset_64(volatile uint64_t *dst, uint64_t expect, uint64_t src) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_cmpset_64_i386(dst, expect, src)); else return (atomic_cmpset_64_i586(dst, expect, src)); } static __inline uint64_t atomic_load_acq_64(volatile uint64_t *p) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_load_acq_64_i386(p)); else return (atomic_load_acq_64_i586(p)); } static __inline void atomic_store_rel_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) atomic_store_rel_64_i386(p, v); else atomic_store_rel_64_i586(p, v); } static __inline uint64_t atomic_swap_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_swap_64_i386(p, v)); else return (atomic_swap_64_i586(p, v)); } #endif /* _KERNEL */ #endif /* KLD_MODULE || !__GNUCLIKE_ASM */ ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v); ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v); ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v); ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v); ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v); ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v); ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v); ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v); ATOMIC_ASM(set, int, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, int, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v); ATOMIC_ASM(set, long, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, long, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, long, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, long, "subl %1,%0", "ir", v); #define ATOMIC_LOADSTORE(TYPE) \ ATOMIC_LOAD(TYPE); \ ATOMIC_STORE(TYPE) ATOMIC_LOADSTORE(char); ATOMIC_LOADSTORE(short); ATOMIC_LOADSTORE(int); ATOMIC_LOADSTORE(long); #undef ATOMIC_ASM #undef ATOMIC_LOAD #undef ATOMIC_STORE #undef ATOMIC_LOADSTORE #ifndef WANT_FUNCTIONS static __inline int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) { return (atomic_cmpset_int((volatile u_int *)dst, (u_int)expect, (u_int)src)); } static __inline u_long atomic_fetchadd_long(volatile u_long *p, u_long v) { return (atomic_fetchadd_int((volatile u_int *)p, (u_int)v)); } static __inline int atomic_testandset_long(volatile u_long *p, u_int v) { return (atomic_testandset_int((volatile u_int *)p, v)); } static __inline int atomic_testandclear_long(volatile u_long *p, u_int v) { return (atomic_testandclear_int((volatile u_int *)p, v)); } /* Read the current value and store a new value in the destination. */ #ifdef __GNUCLIKE_ASM static __inline u_int atomic_swap_int(volatile u_int *p, u_int v) { __asm __volatile( " xchgl %1,%0 ; " "# atomic_swap_int" : "+r" (v), /* 0 */ "+m" (*p)); /* 1 */ return (v); } static __inline u_long atomic_swap_long(volatile u_long *p, u_long v) { return (atomic_swap_int((volatile u_int *)p, (u_int)v)); } #else /* !__GNUCLIKE_ASM */ u_int atomic_swap_int(volatile u_int *p, u_int v); u_long atomic_swap_long(volatile u_long *p, u_long v); #endif /* __GNUCLIKE_ASM */ #define atomic_set_acq_char atomic_set_barr_char #define atomic_set_rel_char atomic_set_barr_char #define atomic_clear_acq_char atomic_clear_barr_char #define atomic_clear_rel_char atomic_clear_barr_char #define atomic_add_acq_char atomic_add_barr_char #define atomic_add_rel_char atomic_add_barr_char #define atomic_subtract_acq_char atomic_subtract_barr_char #define atomic_subtract_rel_char atomic_subtract_barr_char #define atomic_cmpset_acq_char atomic_cmpset_char #define atomic_cmpset_rel_char atomic_cmpset_char #define atomic_fcmpset_acq_char atomic_fcmpset_char #define atomic_fcmpset_rel_char atomic_fcmpset_char #define atomic_set_acq_short atomic_set_barr_short #define atomic_set_rel_short atomic_set_barr_short #define atomic_clear_acq_short atomic_clear_barr_short #define atomic_clear_rel_short atomic_clear_barr_short #define atomic_add_acq_short atomic_add_barr_short #define atomic_add_rel_short atomic_add_barr_short #define atomic_subtract_acq_short atomic_subtract_barr_short #define atomic_subtract_rel_short atomic_subtract_barr_short #define atomic_cmpset_acq_short atomic_cmpset_short #define atomic_cmpset_rel_short atomic_cmpset_short #define atomic_fcmpset_acq_short atomic_fcmpset_short #define atomic_fcmpset_rel_short atomic_fcmpset_short #define atomic_set_acq_int atomic_set_barr_int #define atomic_set_rel_int atomic_set_barr_int #define atomic_clear_acq_int atomic_clear_barr_int #define atomic_clear_rel_int atomic_clear_barr_int #define atomic_add_acq_int atomic_add_barr_int #define atomic_add_rel_int atomic_add_barr_int #define atomic_subtract_acq_int atomic_subtract_barr_int #define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int #define atomic_fcmpset_acq_int atomic_fcmpset_int #define atomic_fcmpset_rel_int atomic_fcmpset_int #define atomic_set_acq_long atomic_set_barr_long #define atomic_set_rel_long atomic_set_barr_long #define atomic_clear_acq_long atomic_clear_barr_long #define atomic_clear_rel_long atomic_clear_barr_long #define atomic_add_acq_long atomic_add_barr_long #define atomic_add_rel_long atomic_add_barr_long #define atomic_subtract_acq_long atomic_subtract_barr_long #define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long #define atomic_fcmpset_acq_long atomic_fcmpset_long #define atomic_fcmpset_rel_long atomic_fcmpset_long #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) /* Operations on 8-bit bytes. */ #define atomic_set_8 atomic_set_char #define atomic_set_acq_8 atomic_set_acq_char #define atomic_set_rel_8 atomic_set_rel_char #define atomic_clear_8 atomic_clear_char #define atomic_clear_acq_8 atomic_clear_acq_char #define atomic_clear_rel_8 atomic_clear_rel_char #define atomic_add_8 atomic_add_char #define atomic_add_acq_8 atomic_add_acq_char #define atomic_add_rel_8 atomic_add_rel_char #define atomic_subtract_8 atomic_subtract_char #define atomic_subtract_acq_8 atomic_subtract_acq_char #define atomic_subtract_rel_8 atomic_subtract_rel_char #define atomic_load_acq_8 atomic_load_acq_char #define atomic_store_rel_8 atomic_store_rel_char #define atomic_cmpset_8 atomic_cmpset_char #define atomic_cmpset_acq_8 atomic_cmpset_acq_char #define atomic_cmpset_rel_8 atomic_cmpset_rel_char #define atomic_fcmpset_8 atomic_fcmpset_char #define atomic_fcmpset_acq_8 atomic_fcmpset_acq_char #define atomic_fcmpset_rel_8 atomic_fcmpset_rel_char /* Operations on 16-bit words. */ #define atomic_set_16 atomic_set_short #define atomic_set_acq_16 atomic_set_acq_short #define atomic_set_rel_16 atomic_set_rel_short #define atomic_clear_16 atomic_clear_short #define atomic_clear_acq_16 atomic_clear_acq_short #define atomic_clear_rel_16 atomic_clear_rel_short #define atomic_add_16 atomic_add_short #define atomic_add_acq_16 atomic_add_acq_short #define atomic_add_rel_16 atomic_add_rel_short #define atomic_subtract_16 atomic_subtract_short #define atomic_subtract_acq_16 atomic_subtract_acq_short #define atomic_subtract_rel_16 atomic_subtract_rel_short #define atomic_load_acq_16 atomic_load_acq_short #define atomic_store_rel_16 atomic_store_rel_short #define atomic_cmpset_16 atomic_cmpset_short #define atomic_cmpset_acq_16 atomic_cmpset_acq_short #define atomic_cmpset_rel_16 atomic_cmpset_rel_short #define atomic_fcmpset_16 atomic_fcmpset_short #define atomic_fcmpset_acq_16 atomic_fcmpset_acq_short #define atomic_fcmpset_rel_16 atomic_fcmpset_rel_short /* Operations on 32-bit double words. */ #define atomic_set_32 atomic_set_int #define atomic_set_acq_32 atomic_set_acq_int #define atomic_set_rel_32 atomic_set_rel_int #define atomic_clear_32 atomic_clear_int #define atomic_clear_acq_32 atomic_clear_acq_int #define atomic_clear_rel_32 atomic_clear_rel_int #define atomic_add_32 atomic_add_int #define atomic_add_acq_32 atomic_add_acq_int #define atomic_add_rel_32 atomic_add_rel_int #define atomic_subtract_32 atomic_subtract_int #define atomic_subtract_acq_32 atomic_subtract_acq_int #define atomic_subtract_rel_32 atomic_subtract_rel_int #define atomic_load_acq_32 atomic_load_acq_int #define atomic_store_rel_32 atomic_store_rel_int #define atomic_cmpset_32 atomic_cmpset_int #define atomic_cmpset_acq_32 atomic_cmpset_acq_int #define atomic_cmpset_rel_32 atomic_cmpset_rel_int #define atomic_fcmpset_32 atomic_fcmpset_int #define atomic_fcmpset_acq_32 atomic_fcmpset_acq_int #define atomic_fcmpset_rel_32 atomic_fcmpset_rel_int #define atomic_swap_32 atomic_swap_int #define atomic_readandclear_32 atomic_readandclear_int #define atomic_fetchadd_32 atomic_fetchadd_int #define atomic_testandset_32 atomic_testandset_int #define atomic_testandclear_32 atomic_testandclear_int /* Operations on pointers. */ #define atomic_set_ptr(p, v) \ atomic_set_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_acq_ptr(p, v) \ atomic_set_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_rel_ptr(p, v) \ atomic_set_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_ptr(p, v) \ atomic_clear_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_acq_ptr(p, v) \ atomic_clear_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_rel_ptr(p, v) \ atomic_clear_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_ptr(p, v) \ atomic_add_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_acq_ptr(p, v) \ atomic_add_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_rel_ptr(p, v) \ atomic_add_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_ptr(p, v) \ atomic_subtract_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_acq_ptr(p, v) \ atomic_subtract_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_rel_ptr(p, v) \ atomic_subtract_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_load_acq_ptr(p) \ atomic_load_acq_int((volatile u_int *)(p)) #define atomic_store_rel_ptr(p, v) \ atomic_store_rel_int((volatile u_int *)(p), (v)) #define atomic_cmpset_ptr(dst, old, new) \ atomic_cmpset_int((volatile u_int *)(dst), (u_int)(old), (u_int)(new)) #define atomic_cmpset_acq_ptr(dst, old, new) \ atomic_cmpset_acq_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_cmpset_rel_ptr(dst, old, new) \ atomic_cmpset_rel_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_fcmpset_ptr(dst, old, new) \ atomic_fcmpset_int((volatile u_int *)(dst), (u_int *)(old), (u_int)(new)) #define atomic_fcmpset_acq_ptr(dst, old, new) \ atomic_fcmpset_acq_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_fcmpset_rel_ptr(dst, old, new) \ atomic_fcmpset_rel_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_swap_ptr(p, v) \ atomic_swap_int((volatile u_int *)(p), (u_int)(v)) #define atomic_readandclear_ptr(p) \ atomic_readandclear_int((volatile u_int *)(p)) #endif /* !WANT_FUNCTIONS */ #if defined(_KERNEL) #define mb() __mbk() #define wmb() __mbk() #define rmb() __mbk() #else #define mb() __mbu() #define wmb() __mbu() #define rmb() __mbu() #endif #endif /* !_MACHINE_ATOMIC_H_ */ Index: head/sys/i386/include/counter.h =================================================================== --- head/sys/i386/include/counter.h (revision 317060) +++ head/sys/i386/include/counter.h (revision 317061) @@ -1,178 +1,180 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif #include #include +extern struct pcpu __pcpu[]; + +#define EARLY_COUNTER &__pcpu[0].pc_early_dummy_counter + #define counter_enter() do { \ if ((cpu_feature & CPUID_CX8) == 0) \ critical_enter(); \ } while (0) #define counter_exit() do { \ if ((cpu_feature & CPUID_CX8) == 0) \ critical_exit(); \ } while (0) - -extern struct pcpu __pcpu[MAXCPU]; static inline void counter_64_inc_8b(uint64_t *p, int64_t inc) { __asm __volatile( "movl %%fs:(%%esi),%%eax\n\t" "movl %%fs:4(%%esi),%%edx\n" "1:\n\t" "movl %%eax,%%ebx\n\t" "movl %%edx,%%ecx\n\t" "addl (%%edi),%%ebx\n\t" "adcl 4(%%edi),%%ecx\n\t" "cmpxchg8b %%fs:(%%esi)\n\t" "jnz 1b" : : "S" ((char *)p - (char *)&__pcpu[0]), "D" (&inc) : "memory", "cc", "eax", "edx", "ebx", "ecx"); } #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one_8b(uint64_t *p) { uint32_t res_lo, res_high; __asm __volatile( "movl %%eax,%%ebx\n\t" "movl %%edx,%%ecx\n\t" "cmpxchg8b (%2)" : "=a" (res_lo), "=d"(res_high) : "SD" (p) : "cc", "ebx", "ecx"); return (res_lo + ((uint64_t)res_high << 32)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t res; int i; res = 0; if ((cpu_feature & CPUID_CX8) == 0) { /* * The machines without cmpxchg8b are not SMP. * Disabling the preemption provides atomicity of the * counter reading, since update is done in the * critical section as well. */ critical_enter(); CPU_FOREACH(i) { res += *(uint64_t *)((char *)p + sizeof(struct pcpu) * i); } critical_exit(); } else { CPU_FOREACH(i) res += counter_u64_read_one_8b((uint64_t *)((char *)p + sizeof(struct pcpu) * i)); } return (res); } static inline void counter_u64_zero_one_8b(uint64_t *p) { __asm __volatile( "movl (%0),%%eax\n\t" "movl 4(%0),%%edx\n" "xorl %%ebx,%%ebx\n\t" "xorl %%ecx,%%ecx\n\t" "1:\n\t" "cmpxchg8b (%0)\n\t" "jnz 1b" : : "SD" (p) : "memory", "cc", "eax", "edx", "ebx", "ecx"); } static void counter_u64_zero_one_cpu(void *arg) { uint64_t *p; p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid)); counter_u64_zero_one_8b(p); } static inline void counter_u64_zero_inline(counter_u64_t c) { int i; if ((cpu_feature & CPUID_CX8) == 0) { critical_enter(); CPU_FOREACH(i) *(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0; critical_exit(); } else { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } } #endif #define counter_u64_add_protected(c, inc) do { \ if ((cpu_feature & CPUID_CX8) == 0) { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } else \ counter_64_inc_8b((c), (inc)); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { if ((cpu_feature & CPUID_CX8) == 0) { critical_enter(); *(uint64_t *)zpcpu_get(c) += inc; critical_exit(); } else { counter_64_inc_8b(c, inc); } } #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/i386/include/pcpu.h =================================================================== --- head/sys/i386/include/pcpu.h (revision 317060) +++ head/sys/i386/include/pcpu.h (revision 317061) @@ -1,253 +1,253 @@ /*- * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif #include #include #include #include /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. * The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ struct i386tss pc_common_tss; \ struct segment_descriptor pc_common_tssd; \ struct segment_descriptor *pc_tss_gdt; \ struct segment_descriptor *pc_fsgs_gdt; \ int pc_currentldt; \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ struct mtx pc_cmap_lock; \ void *pc_cmap_pte1; \ void *pc_cmap_pte2; \ caddr_t pc_cmap_addr1; \ caddr_t pc_cmap_addr2; \ vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[189] + char __pad[445] #ifdef _KERNEL #ifdef lint extern struct pcpu *pcpup; #define get_pcpu() (pcpup) #define PCPU_GET(member) (pcpup->pc_ ## member) #define PCPU_ADD(member, val) (pcpup->pc_ ## member += (val)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&pcpup->pc_ ## member) #define PCPU_SET(member, val) (pcpup->pc_ ## member = (val)) #elif defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. */ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movl %%fs:%1,%0; addl %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__res), 4)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4) { \ __asm __volatile("mov %%fs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds a value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. */ #define __PCPU_SET(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } while (0) #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movl %%fs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define OFFSETOF_CURTHREAD 0 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnull-dereference" #endif static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("movl %%fs:%1,%0" : "=r" (td) : "m" (*(char *)OFFSETOF_CURTHREAD)); return (td); } #ifdef __clang__ #pragma clang diagnostic pop #endif #define curthread (__curthread()) #define OFFSETOF_CURPCB 16 static __inline __pure2 struct pcb * __curpcb(void) { struct pcb *pcb; __asm("movl %%fs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB)); return (pcb); } #define curpcb (__curpcb()) #else /* !lint || defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */ #error "this file needs to be ported to your compiler" #endif /* lint, etc. */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 317060) +++ head/sys/kern/kern_fork.c (revision 317061) @@ -1,1116 +1,1116 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(struct thread *td, struct pdfork_args *uap) { struct fork_req fr; int error, fd, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPROCDESC; fr.fr_pidp = &pid; fr.fr_pd_fd = &fd; fr.fr_pd_flags = uap->flags; /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct fork_req fr; int error, pid; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); fr.fr_flags = uap->flags; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > pid_max - 100) /* out of range */ pid = pid_max - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); static int fork_findpid(int flags) { struct proc *p; int trypid; static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ sx_assert(&allproc_lock, SX_LOCKED); sx_assert(&proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= pid_max) { trypid = trypid % pid_max; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. * * Avoid reuse of the process group id, session id or * the reaper subtree id. Note that for process group * and sessions, the amount of reserved pids is * limited by process limit. For the subtree ids, the * id is kept reserved only while there is a * non-reaped process in the subtree, so amount of * reserved pids is limited by process limit times * two. */ p = LIST_FIRST(&allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || p->p_reapsubtree == trypid || (p->p_pgrp != NULL && (p->p_pgrp->pg_id == trypid || (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p->p_pid > trypid && pidchecked > p->p_pid) pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && pidchecked > p->p_pgrp->pg_id) pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && pidchecked > p->p_session->s_sid) pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; return (trypid); } static int fork_norfproc(struct thread *td, int flags) { int error; struct proc *p1; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error) goto fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd, false); fdescfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(td); fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2, struct vmspace *vm2, struct file *fp_procdesc) { struct proc *p1, *pptr; int trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p1 = td->td_proc; trypid = fork_findpid(fr->fr_flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { fd = fdinit(p1->p_fd, false); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC | P2_TRAPCAP); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); /* * Whilst the proc lock is held, copy the VM domain data out * using the VM domain method. */ vm_domain_policy_init(&p2->p_vm_dom_policy); vm_domain_policy_localcopy(&p2->p_vm_dom_policy, &p1->p_vm_dom_policy); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vrefact(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { - PCPU_INC(cnt.v_forks); - PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize + + VM_CNT_INC(v_forks); + VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { - PCPU_INC(cnt.v_vforks); - PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize + + VM_CNT_INC(v_vforks); + VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { - PCPU_INC(cnt.v_kthreads); - PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + + VM_CNT_INC(v_kthreads); + VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { - PCPU_INC(cnt.v_rforks); - PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize + + VM_CNT_INC(v_rforks); + VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif /* * Hold the process so that it cannot exit after we make it runnable, * but before we wait for the debugger. */ _PHOLD(p2); if (p1->p_ptevents & PTRACE_FORK) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; } if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } if ((fr->fr_flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; } else { *fr->fr_procp = p2; } PROC_LOCK(p2); /* * Wait until debugger is attached to child. */ while (td2->td_proc == p2 && (td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); _PRELE(p2); racct_proc_fork_done(p2); PROC_UNLOCK(p2); } int fork1(struct thread *td, struct fork_req *fr) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; static int curfail; static struct timeval lastfail; int flags, pages; flags = fr->fr_flags; pages = fr->fr_pages; if ((flags & RFSTOPPED) != 0) MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL); else MPASS(fr->fr_procp == NULL); /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (fr->fr_pd_fd == NULL) return (EINVAL); /* Check if we are using supported flags. */ if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if (fr->fr_procp != NULL) *fr->fr_procp = NULL; else if (fr->fr_pidp != NULL) *fr->fr_pidp = 0; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd, fr->fr_pd_flags, fr->fr_pd_fcaps); if (error != 0) goto fail2; } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail2; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * subtracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ proc_set_cred_init(newproc, crhold(td->td_ucred)); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); sx_xlock(&allproc_lock); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC)); } if (ok) { do_fork(td, fr, newproc, td2, vm2, fp_procdesc); return (0); } error = EAGAIN; sx_sunlock(&proctree_lock); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: crfree(newproc->p_ucred); newproc->p_ucred = NULL; fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td_get_sched(td), p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_fork_kthread_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KPROC) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); td->td_pflags &= ~TDP_FORKING; } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. This function is passed in to fork_exit() * as the first parameter and is called when returning to a new * userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p, *dbg; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { sx_xlock(&proctree_lock); PROC_LOCK(p); if (p->p_pptr->p_ptevents & PTRACE_FORK) { /* * If debugger still wants auto-attach for the * parent's children, do it now. */ dbg = p->p_pptr->p_pptr; proc_set_traced(p, true); CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); proc_reparent(p, dbg); sx_xunlock(&proctree_lock); td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP; ptracestop(td, SIGSTOP, NULL); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ sx_xunlock(&proctree_lock); td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; _STOPEVENT(p, S_SCX, td->td_dbg_sc_code); if ((p->p_ptevents & PTRACE_SCX) != 0 || (td->td_dbgflags & TDB_BORN) != 0) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_BORN); PROC_UNLOCK(p); } userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif } Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 317060) +++ head/sys/kern/kern_intr.c (revision 317061) @@ -1,1934 +1,1934 @@ /*- * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. */ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); #ifdef INTR_FILTER static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *ithd); static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd); static struct intr_thread *ithread_create(const char *name, struct intr_handler *ih); #else static int intr_event_schedule_thread(struct intr_event *ie); static struct intr_thread *ithread_create(const char *name); #endif static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih); #endif static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; /* Determine the overall priority of this event. */ if (TAILQ_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. */ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); ie->ie_flags &= ~IE_ENTROPY; missed = 0; space = 1; /* Run through all the handlers updating values. */ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; if (ih->ih_flags & IH_ENTROPY) ie->ie_flags |= IE_ENTROPY; } /* * If the handler names were too long, add +'s to indicate missing * names. If we run out of room and still have +'s to add, change * the last character from a + to a *. */ last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update it's priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. */ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; TAILQ_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. For supported platforms, any * associated ithreads as well as the primary interrupt context will * be bound to the specificed CPU. Using a cpu id of NOCPU unbinds * the interrupt event. */ int intr_event_bind(struct intr_event *ie, int cpu) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc. */ mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); return (error); } mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); return (error); } static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && TAILQ_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set. */ if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); return (intr_event_bind(ie, cpu)); } int intr_getaffinity(int irq, void *m) { struct intr_event *ie; cpuset_t *mask; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); CPU_ZERO(mask); mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } #ifndef INTR_FILTER static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #else static struct intr_thread * ithread_create(const char *name, struct intr_handler *ih) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ih, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #endif static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } thread_unlock(td); } #ifndef INTR_FILTER int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in a event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #else int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in a event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* For filtered handlers, create a private ithread to run on. */ if (filter != NULL && handler != NULL) { mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); it->it_event = ie; ih->ih_thread = it; ithread_update(it); /* XXX - do we really need this?!?!? */ } else { /* Create the global per-event thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } } /* Add the new handler to the event in priority order. */ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #endif /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. */ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for compatibility with linux drivers * only. Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. */ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } #ifndef INTR_FILTER int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there is no ithread, then just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another * CPU! */ if (ie->ie_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&ie->ie_thread->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (!(ih->ih_flags & IH_FAST)) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #else int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_thread *it; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there are no ithreads (per event and per handler), then * just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another CPU! */ if (ie->ie_thread == NULL && handler->ih_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* Private or global ithread? */ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread; /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(it->it_thread); if (!TD_AWAITING_INTR(it->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&it->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(it->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); /* * At this point, the handler has been disconnected from the event, * so we can kill the private ithread if any. */ if (handler->ih_thread) { ithread_destroy(handler->ih_thread); handler->ih_thread = NULL; } intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it) { struct intr_entropy entropy; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL) return (EINVAL); ctd = curthread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #endif /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? *eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), 1, RANDOM_SWI); /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (!(flags & SWI_DELAY)) { - PCPU_INC(cnt.v_soft); + VM_CNT_INC(v_soft); #ifdef INTR_FILTER error = intr_event_schedule_thread(ie, ie->ie_thread); #else error = intr_event_schedule_thread(ie); #endif KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih) { struct intr_event *ie; ie = ih->ih_event; /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); return; } /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } #endif /* * This is a public function for use by drivers that mux interrupt * handlers for child devices from their interrupt handler. */ void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn; TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. */ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } #ifndef INTR_FILTER /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) ithread_execute_handlers(p, ie); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (i.e. i386) pass a frame to some. * handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int error, ret, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; thread = 0; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_filter == NULL) { thread = 1; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler. * * TODO: write a generic wrapper to avoid people rolling * their own */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = 1; } } td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; return (0); } #else /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_handler *ih; struct intr_event *ie; struct thread *td; struct proc *p; int priv; int wake; td = curthread; p = td->td_proc; ih = (struct intr_handler *)arg; priv = (ih->ih_thread != NULL) ? 1 : 0; ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) { if (priv) priv_ithread_execute_handler(p, ih); else ithread_execute_handlers(p, ie); } WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main loop for interrupt filter. * * Some architectures (i386, amd64 and arm) require the optional frame * parameter, and use it as the main argument for fast handler execution * when ih_argument == NULL. * * Return value: * o FILTER_STRAY: No filter recognized the event, and no * filter-less handler is registered on this * line. * o FILTER_HANDLED: A filter claimed the event and served it. * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at * least one filter-less handler on this line. * o FILTER_HANDLED | * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for * scheduling the per-handler ithread. * * In case an ithread has to be scheduled, in *ithd there will be a * pointer to a struct intr_thread containing the thread to be * scheduled. */ static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd) { struct intr_handler *ih; void *arg; int ret, thread_only; ret = 0; thread_only = 0; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument); CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__, ih->ih_filter, ih->ih_handler, arg, ih->ih_name); if (ih->ih_filter != NULL) ret = ih->ih_filter(arg); else { thread_only = 1; continue; } KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); if (ret & FILTER_STRAY) continue; else { *ithd = ih->ih_thread; return (ret); } } /* * No filters handled the interrupt and we have at least * one handler without a filter. In this case, we schedule * all of the filter-less handlers to run in the ithread. */ if (thread_only) { *ithd = ie->ie_thread; return (FILTER_SCHEDULE_THREAD); } return (FILTER_STRAY); } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (i.e. i386) pass a frame to some. * handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_thread *ithd; struct trapframe *oldframe; struct thread *td; int thread; ithd = NULL; td = curthread; if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); td->td_intr_nesting_level++; thread = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; thread = intr_filter_loop(ie, frame, &ithd); if (thread & FILTER_HANDLED) { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } else { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } td->td_intr_frame = oldframe; critical_exit(); /* Interrupt storm logic */ if (thread & FILTER_STRAY) { ie->ie_count++; if (ie->ie_count < intr_storm_threshold) printf("Interrupt stray detection not present\n"); } /* Schedule an ithread if needed. */ if (thread & FILTER_SCHEDULE_THREAD) { if (intr_event_schedule_thread(ie, ithd) != 0) panic("%s: impossible stray interrupt", __func__); } td->td_intr_nesting_level--; return (0); } #endif #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_AV: db_printf("AV "); break; case PI_TTY: db_printf("TTY "); break; case PI_NET: db_printf("NET "); break; case PI_DISK: db_printf("DISK"); break; case PI_DULL: db_printf("DULL"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about a event. */ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(intr, db_show_intr) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && TAILQ_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih)) panic("died while creating vm swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt however is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time. */ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. */ DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 317060) +++ head/sys/kern/kern_synch.c (revision 317061) @@ -1,574 +1,574 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static uint8_t pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, ""); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. */ int _sleep(void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); td = curthread; p = td->td_proc; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(ident != NULL, ("_sleep: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (SCHEDULER_STOPPED_TD(td)) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep")); if ((uint8_t *)ident >= &pause_wchan[0] && (uint8_t *)ident <= &pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct proc *p; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running")); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause() delays the calling thread by the given number of system ticks. * During cold bootup, pause() uses the DELAY() function instead of * the tsleep() function to do the waiting. The "timo" argument must be * greater than or equal to zero. A "timo" value of zero is equivalent * to a "timo" value of one. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if ((cold && curthread == &thread0) || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (0); } return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * The machine independent parts of context switching. */ void mi_switch(int flags, struct thread *newtd) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. */ if (kdb_active) kdb_switch(); if (SCHEDULER_STOPPED_TD(td)) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ - PCPU_INC(cnt.v_swtch); + VM_CNT_INC(v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #ifdef KDTRACE_HOOKS if ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 && (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED)) SDT_PROBE0(sched, , , preempt); #endif sched_switch(td, newtd, flags); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. */ int setrunnable(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return (0); /* FALLTHROUGH */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((td->td_flags & TDF_INMEM) == 0) { if ((td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; return (1); } } else sched_wakeup(td); return (0); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } /* ARGSUSED */ static void synch_setup(void *dummy) { callout_init(&loadav_callout, 1); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } int should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); PICKUP_GIANT(); } /* * General purpose yield system call. */ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); td->td_retval[0] = 0; return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 317060) +++ head/sys/kern/kern_thread.c (revision 317061) @@ -1,1209 +1,1209 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); - PCPU_INC(cnt.v_swtch); + VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/kern/subr_intr.c =================================================================== --- head/sys/kern/subr_intr.c (revision 317060) +++ head/sys/kern/subr_intr.c (revision 317061) @@ -1,1660 +1,1660 @@ /*- * Copyright (c) 2015-2016 Svatopluk Kraus * Copyright (c) 2015-2016 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * New-style Interrupt Framework * * TODO: - add support for disconnected PICs. * - to support IPI (PPI) enabling on other CPUs if already started. * - to complete things for removable PICs. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #ifdef DDB #include #endif #include "pic_if.h" #include "msi_if.h" #define INTRNAME_LEN (2*MAXCOMLEN + 1) #ifdef DEBUG #define debugf(fmt, args...) do { printf("%s(): ", __func__); \ printf(fmt,##args); } while (0) #else #define debugf(fmt, args...) #endif MALLOC_DECLARE(M_INTRNG); MALLOC_DEFINE(M_INTRNG, "intr", "intr interrupt handling"); /* Main interrupt handler called from assembler -> 'hidden' for C code. */ void intr_irq_handler(struct trapframe *tf); /* Root interrupt controller stuff. */ device_t intr_irq_root_dev; static intr_irq_filter_t *irq_root_filter; static void *irq_root_arg; static u_int irq_root_ipicount; struct intr_pic_child { SLIST_ENTRY(intr_pic_child) pc_next; struct intr_pic *pc_pic; intr_child_irq_filter_t *pc_filter; void *pc_filter_arg; uintptr_t pc_start; uintptr_t pc_length; }; /* Interrupt controller definition. */ struct intr_pic { SLIST_ENTRY(intr_pic) pic_next; intptr_t pic_xref; /* hardware identification */ device_t pic_dev; /* Only one of FLAG_PIC or FLAG_MSI may be set */ #define FLAG_PIC (1 << 0) #define FLAG_MSI (1 << 1) #define FLAG_TYPE_MASK (FLAG_PIC | FLAG_MSI) u_int pic_flags; struct mtx pic_child_lock; SLIST_HEAD(, intr_pic_child) pic_children; }; static struct mtx pic_list_lock; static SLIST_HEAD(, intr_pic) pic_list; static struct intr_pic *pic_lookup(device_t dev, intptr_t xref, int flags); /* Interrupt source definition. */ static struct mtx isrc_table_lock; static struct intr_irqsrc *irq_sources[NIRQ]; u_int irq_next_free; #ifdef SMP static boolean_t irq_assign_cpu = FALSE; #endif /* * - 2 counters for each I/O interrupt. * - MAXCPU counters for each IPI counters for SMP. */ #ifdef SMP #define INTRCNT_COUNT (NIRQ * 2 + INTR_IPI_COUNT * MAXCPU) #else #define INTRCNT_COUNT (NIRQ * 2) #endif /* Data for MI statistics reporting. */ u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * INTRNAME_LEN]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static u_int intrcnt_index; static struct intr_irqsrc *intr_map_get_isrc(u_int res_id); static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc); static struct intr_map_data * intr_map_get_map_data(u_int res_id); static void intr_map_copy_map_data(u_int res_id, device_t *dev, intptr_t *xref, struct intr_map_data **data); /* * Interrupt framework initialization routine. */ static void intr_irq_init(void *dummy __unused) { SLIST_INIT(&pic_list); mtx_init(&pic_list_lock, "intr pic list", NULL, MTX_DEF); mtx_init(&isrc_table_lock, "intr isrc table", NULL, MTX_DEF); } SYSINIT(intr_irq_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_irq_init, NULL); static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, name); } /* * Update name for interrupt source with interrupt event. */ static void intrcnt_updatename(struct intr_irqsrc *isrc) { /* QQQ: What about stray counter name? */ mtx_assert(&isrc_table_lock, MA_OWNED); intrcnt_setname(isrc->isrc_event->ie_fullname, isrc->isrc_index); } /* * Virtualization for interrupt source interrupt counter increment. */ static inline void isrc_increment_count(struct intr_irqsrc *isrc) { if (isrc->isrc_flags & INTR_ISRCF_PPI) atomic_add_long(&isrc->isrc_count[0], 1); else isrc->isrc_count[0]++; } /* * Virtualization for interrupt source interrupt stray counter increment. */ static inline void isrc_increment_straycount(struct intr_irqsrc *isrc) { isrc->isrc_count[1]++; } /* * Virtualization for interrupt source interrupt name update. */ static void isrc_update_name(struct intr_irqsrc *isrc, const char *name) { char str[INTRNAME_LEN]; mtx_assert(&isrc_table_lock, MA_OWNED); if (name != NULL) { snprintf(str, INTRNAME_LEN, "%s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s: %s", isrc->isrc_name, name); intrcnt_setname(str, isrc->isrc_index + 1); } else { snprintf(str, INTRNAME_LEN, "%s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index); snprintf(str, INTRNAME_LEN, "stray %s:", isrc->isrc_name); intrcnt_setname(str, isrc->isrc_index + 1); } } /* * Virtualization for interrupt source interrupt counters setup. */ static void isrc_setup_counters(struct intr_irqsrc *isrc) { u_int index; /* * XXX - it does not work well with removable controllers and * interrupt sources !!! */ index = atomic_fetchadd_int(&intrcnt_index, 2); isrc->isrc_index = index; isrc->isrc_count = &intrcnt[index]; isrc_update_name(isrc, NULL); } /* * Virtualization for interrupt source interrupt counters release. */ static void isrc_release_counters(struct intr_irqsrc *isrc) { panic("%s: not implemented", __func__); } #ifdef SMP /* * Virtualization for interrupt source IPI counters setup. */ u_long * intr_ipi_setup_counters(const char *name) { u_int index, i; char str[INTRNAME_LEN]; index = atomic_fetchadd_int(&intrcnt_index, MAXCPU); for (i = 0; i < MAXCPU; i++) { snprintf(str, INTRNAME_LEN, "cpu%d:%s", i, name); intrcnt_setname(str, index + i); } return (&intrcnt[index]); } #endif /* * Main interrupt dispatch handler. It's called straight * from the assembler, where CPU interrupt is served. */ void intr_irq_handler(struct trapframe *tf) { struct trapframe * oldframe; struct thread * td; KASSERT(irq_root_filter != NULL, ("%s: no filter", __func__)); - PCPU_INC(cnt.v_intr); + VM_CNT_INC(v_intr); critical_enter(); td = curthread; oldframe = td->td_intr_frame; td->td_intr_frame = tf; irq_root_filter(irq_root_arg); td->td_intr_frame = oldframe; critical_exit(); #ifdef HWPMC_HOOKS if (pmc_hook && TRAPF_USERMODE(tf) && (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN)) pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, tf); #endif } int intr_child_irq_handler(struct intr_pic *parent, uintptr_t irq) { struct intr_pic_child *child; bool found; found = false; mtx_lock_spin(&parent->pic_child_lock); SLIST_FOREACH(child, &parent->pic_children, pc_next) { if (child->pc_start <= irq && irq < (child->pc_start + child->pc_length)) { found = true; break; } } mtx_unlock_spin(&parent->pic_child_lock); if (found) return (child->pc_filter(child->pc_filter_arg, irq)); return (FILTER_STRAY); } /* * interrupt controller dispatch function for interrupts. It should * be called straight from the interrupt controller, when associated interrupt * source is learned. */ int intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf) { KASSERT(isrc != NULL, ("%s: no source", __func__)); isrc_increment_count(isrc); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { int error; error = isrc->isrc_filter(isrc->isrc_arg, tf); PIC_POST_FILTER(isrc->isrc_dev, isrc); if (error == FILTER_HANDLED) return (0); } else #endif if (isrc->isrc_event != NULL) { if (intr_event_handle(isrc->isrc_event, tf) == 0) return (0); } isrc_increment_straycount(isrc); return (EINVAL); } /* * Alloc unique interrupt number (resource handle) for interrupt source. * * There could be various strategies how to allocate free interrupt number * (resource handle) for new interrupt source. * * 1. Handles are always allocated forward, so handles are not recycled * immediately. However, if only one free handle left which is reused * constantly... */ static inline int isrc_alloc_irq(struct intr_irqsrc *isrc) { u_int maxirqs, irq; mtx_assert(&isrc_table_lock, MA_OWNED); maxirqs = nitems(irq_sources); if (irq_next_free >= maxirqs) return (ENOSPC); for (irq = irq_next_free; irq < maxirqs; irq++) { if (irq_sources[irq] == NULL) goto found; } for (irq = 0; irq < irq_next_free; irq++) { if (irq_sources[irq] == NULL) goto found; } irq_next_free = maxirqs; return (ENOSPC); found: isrc->isrc_irq = irq; irq_sources[irq] = isrc; irq_next_free = irq + 1; if (irq_next_free >= maxirqs) irq_next_free = 0; return (0); } /* * Free unique interrupt number (resource handle) from interrupt source. */ static inline int isrc_free_irq(struct intr_irqsrc *isrc) { mtx_assert(&isrc_table_lock, MA_OWNED); if (isrc->isrc_irq >= nitems(irq_sources)) return (EINVAL); if (irq_sources[isrc->isrc_irq] != isrc) return (EINVAL); irq_sources[isrc->isrc_irq] = NULL; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ return (0); } /* * Initialize interrupt source and register it into global interrupt table. */ int intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags, const char *fmt, ...) { int error; va_list ap; bzero(isrc, sizeof(struct intr_irqsrc)); isrc->isrc_dev = dev; isrc->isrc_irq = INTR_IRQ_INVALID; /* just to be safe */ isrc->isrc_flags = flags; va_start(ap, fmt); vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap); va_end(ap); mtx_lock(&isrc_table_lock); error = isrc_alloc_irq(isrc); if (error != 0) { mtx_unlock(&isrc_table_lock); return (error); } /* * Setup interrupt counters, but not for IPI sources. Those are setup * later and only for used ones (up to INTR_IPI_COUNT) to not exhaust * our counter pool. */ if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_setup_counters(isrc); mtx_unlock(&isrc_table_lock); return (0); } /* * Deregister interrupt source from global interrupt table. */ int intr_isrc_deregister(struct intr_irqsrc *isrc) { int error; mtx_lock(&isrc_table_lock); if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) isrc_release_counters(isrc); error = isrc_free_irq(isrc); mtx_unlock(&isrc_table_lock); return (error); } #ifdef SMP /* * A support function for a PIC to decide if provided ISRC should be inited * on given cpu. The logic of INTR_ISRCF_BOUND flag and isrc_cpu member of * struct intr_irqsrc is the following: * * If INTR_ISRCF_BOUND is set, the ISRC should be inited only on cpus * set in isrc_cpu. If not, the ISRC should be inited on every cpu and * isrc_cpu is kept consistent with it. Thus isrc_cpu is always correct. */ bool intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu) { if (isrc->isrc_handlers == 0) return (false); if ((isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) == 0) return (false); if (isrc->isrc_flags & INTR_ISRCF_BOUND) return (CPU_ISSET(cpu, &isrc->isrc_cpu)); CPU_SET(cpu, &isrc->isrc_cpu); return (true); } #endif #ifdef INTR_SOLO /* * Setup filter into interrupt source. */ static int iscr_setup_filter(struct intr_irqsrc *isrc, const char *name, intr_irq_filter_t *filter, void *arg, void **cookiep) { if (filter == NULL) return (EINVAL); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. */ if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { mtx_unlock(&isrc_table_lock); return (EBUSY); } isrc->isrc_filter = filter; isrc->isrc_arg = arg; isrc_update_name(isrc, name); mtx_unlock(&isrc_table_lock); *cookiep = isrc; return (0); } #endif /* * Interrupt source pre_ithread method for MI interrupt framework. */ static void intr_isrc_pre_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_PRE_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_ithread method for MI interrupt framework. */ static void intr_isrc_post_ithread(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_ITHREAD(isrc->isrc_dev, isrc); } /* * Interrupt source post_filter method for MI interrupt framework. */ static void intr_isrc_post_filter(void *arg) { struct intr_irqsrc *isrc = arg; PIC_POST_FILTER(isrc->isrc_dev, isrc); } /* * Interrupt source assign_cpu method for MI interrupt framework. */ static int intr_isrc_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intr_irqsrc *isrc = arg; int error; if (isrc->isrc_dev != intr_irq_root_dev) return (EINVAL); mtx_lock(&isrc_table_lock); if (cpu == NOCPU) { CPU_ZERO(&isrc->isrc_cpu); isrc->isrc_flags &= ~INTR_ISRCF_BOUND; } else { CPU_SETOF(cpu, &isrc->isrc_cpu); isrc->isrc_flags |= INTR_ISRCF_BOUND; } /* * In NOCPU case, it's up to PIC to either leave ISRC on same CPU or * re-balance it to another CPU or enable it on more CPUs. However, * PIC is expected to change isrc_cpu appropriately to keep us well * informed if the call is successful. */ if (irq_assign_cpu) { error = PIC_BIND_INTR(isrc->isrc_dev, isrc); if (error) { CPU_ZERO(&isrc->isrc_cpu); mtx_unlock(&isrc_table_lock); return (error); } } mtx_unlock(&isrc_table_lock); return (0); #else return (EOPNOTSUPP); #endif } /* * Create interrupt event for interrupt source. */ static int isrc_event_create(struct intr_irqsrc *isrc) { struct intr_event *ie; int error; error = intr_event_create(&ie, isrc, 0, isrc->isrc_irq, intr_isrc_pre_ithread, intr_isrc_post_ithread, intr_isrc_post_filter, intr_isrc_assign_cpu, "%s:", isrc->isrc_name); if (error) return (error); mtx_lock(&isrc_table_lock); /* * Make sure that we do not mix the two ways * how we handle interrupt sources. Let contested event wins. */ #ifdef INTR_SOLO if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) { #else if (isrc->isrc_event != NULL) { #endif mtx_unlock(&isrc_table_lock); intr_event_destroy(ie); return (isrc->isrc_event != NULL ? EBUSY : 0); } isrc->isrc_event = ie; mtx_unlock(&isrc_table_lock); return (0); } #ifdef notyet /* * Destroy interrupt event for interrupt source. */ static void isrc_event_destroy(struct intr_irqsrc *isrc) { struct intr_event *ie; mtx_lock(&isrc_table_lock); ie = isrc->isrc_event; isrc->isrc_event = NULL; mtx_unlock(&isrc_table_lock); if (ie != NULL) intr_event_destroy(ie); } #endif /* * Add handler to interrupt source. */ static int isrc_add_handler(struct intr_irqsrc *isrc, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) { int error; if (isrc->isrc_event == NULL) { error = isrc_event_create(isrc); if (error) return (error); } error = intr_event_add_handler(isrc->isrc_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } /* * Lookup interrupt controller locked. */ static inline struct intr_pic * pic_lookup_locked(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_assert(&pic_list_lock, MA_OWNED); if (dev == NULL && xref == 0) return (NULL); /* Note that pic->pic_dev is never NULL on registered PIC. */ SLIST_FOREACH(pic, &pic_list, pic_next) { if ((pic->pic_flags & FLAG_TYPE_MASK) != (flags & FLAG_TYPE_MASK)) continue; if (dev == NULL) { if (xref == pic->pic_xref) return (pic); } else if (xref == 0 || pic->pic_xref == 0) { if (dev == pic->pic_dev) return (pic); } else if (xref == pic->pic_xref && dev == pic->pic_dev) return (pic); } return (NULL); } /* * Lookup interrupt controller. */ static struct intr_pic * pic_lookup(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); mtx_unlock(&pic_list_lock); return (pic); } /* * Create interrupt controller. */ static struct intr_pic * pic_create(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); if (pic != NULL) { mtx_unlock(&pic_list_lock); return (pic); } pic = malloc(sizeof(*pic), M_INTRNG, M_NOWAIT | M_ZERO); if (pic == NULL) { mtx_unlock(&pic_list_lock); return (NULL); } pic->pic_xref = xref; pic->pic_dev = dev; pic->pic_flags = flags; mtx_init(&pic->pic_child_lock, "pic child lock", NULL, MTX_SPIN); SLIST_INSERT_HEAD(&pic_list, pic, pic_next); mtx_unlock(&pic_list_lock); return (pic); } #ifdef notyet /* * Destroy interrupt controller. */ static void pic_destroy(device_t dev, intptr_t xref, int flags) { struct intr_pic *pic; mtx_lock(&pic_list_lock); pic = pic_lookup_locked(dev, xref, flags); if (pic == NULL) { mtx_unlock(&pic_list_lock); return; } SLIST_REMOVE(&pic_list, pic, intr_pic, pic_next); mtx_unlock(&pic_list_lock); free(pic, M_INTRNG); } #endif /* * Register interrupt controller. */ struct intr_pic * intr_pic_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (NULL); pic = pic_create(dev, xref, FLAG_PIC); if (pic == NULL) return (NULL); debugf("PIC %p registered for %s \n", pic, device_get_nameunit(dev), dev, xref); return (pic); } /* * Unregister interrupt controller. */ int intr_pic_deregister(device_t dev, intptr_t xref) { panic("%s: not implemented", __func__); } /* * Mark interrupt controller (itself) as a root one. * * Note that only an interrupt controller can really know its position * in interrupt controller's tree. So root PIC must claim itself as a root. * * In FDT case, according to ePAPR approved version 1.1 from 08 April 2011, * page 30: * "The root of the interrupt tree is determined when traversal * of the interrupt tree reaches an interrupt controller node without * an interrupts property and thus no explicit interrupt parent." */ int intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter, void *arg, u_int ipicount) { struct intr_pic *pic; pic = pic_lookup(dev, xref, FLAG_PIC); if (pic == NULL) { device_printf(dev, "not registered\n"); return (EINVAL); } KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); if (filter == NULL) { device_printf(dev, "filter missing\n"); return (EINVAL); } /* * Only one interrupt controllers could be on the root for now. * Note that we further suppose that there is not threaded interrupt * routine (handler) on the root. See intr_irq_handler(). */ if (intr_irq_root_dev != NULL) { device_printf(dev, "another root already set\n"); return (EBUSY); } intr_irq_root_dev = dev; irq_root_filter = filter; irq_root_arg = arg; irq_root_ipicount = ipicount; debugf("irq root set to %s\n", device_get_nameunit(dev)); return (0); } /* * Add a handler to manage a sub range of a parents interrupts. */ struct intr_pic * intr_pic_add_handler(device_t parent, struct intr_pic *pic, intr_child_irq_filter_t *filter, void *arg, uintptr_t start, uintptr_t length) { struct intr_pic *parent_pic; struct intr_pic_child *newchild; #ifdef INVARIANTS struct intr_pic_child *child; #endif /* Find the parent PIC */ parent_pic = pic_lookup(parent, 0, FLAG_PIC); if (parent_pic == NULL) return (NULL); newchild = malloc(sizeof(*newchild), M_INTRNG, M_WAITOK | M_ZERO); newchild->pc_pic = pic; newchild->pc_filter = filter; newchild->pc_filter_arg = arg; newchild->pc_start = start; newchild->pc_length = length; mtx_lock_spin(&parent_pic->pic_child_lock); #ifdef INVARIANTS SLIST_FOREACH(child, &parent_pic->pic_children, pc_next) { KASSERT(child->pc_pic != pic, ("%s: Adding a child PIC twice", __func__)); } #endif SLIST_INSERT_HEAD(&parent_pic->pic_children, newchild, pc_next); mtx_unlock_spin(&parent_pic->pic_child_lock); return (pic); } static int intr_resolve_irq(device_t dev, intptr_t xref, struct intr_map_data *data, struct intr_irqsrc **isrc) { struct intr_pic *pic; struct intr_map_data_msi *msi; if (data == NULL) return (EINVAL); pic = pic_lookup(dev, xref, (data->type == INTR_MAP_DATA_MSI) ? FLAG_MSI : FLAG_PIC); if (pic == NULL) return (ESRCH); switch (data->type) { case INTR_MAP_DATA_MSI: KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *)data; *isrc = msi->isrc; return (0); default: KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC, ("%s: Found a non-PIC controller: %s", __func__, device_get_name(pic->pic_dev))); return (PIC_MAP_INTR(pic->pic_dev, data, isrc)); } } int intr_activate_irq(device_t dev, struct resource *res) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); if (intr_map_get_isrc(res_id) != NULL) panic("Attempt to double activation of resource id: %u\n", res_id); intr_map_copy_map_data(res_id, &map_dev, &map_xref, &data); error = intr_resolve_irq(map_dev, map_xref, data, &isrc); if (error != 0) { free(data, M_INTRNG); /* XXX TODO DISCONECTED PICs */ /* if (error == EINVAL) return(0); */ return (error); } intr_map_set_isrc(res_id, isrc); rman_set_virtual(res, data); return (PIC_ACTIVATE_INTR(isrc->isrc_dev, isrc, res, data)); } int intr_deactivate_irq(device_t dev, struct resource *res) { struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; int error; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) panic("Attempt to deactivate non-active resource id: %u\n", res_id); data = rman_get_virtual(res); error = PIC_DEACTIVATE_INTR(isrc->isrc_dev, isrc, res, data); intr_map_set_isrc(res_id, NULL); rman_set_virtual(res, NULL); free(data, M_INTRNG); return (error); } int intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt, driver_intr_t hand, void *arg, int flags, void **cookiep) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; const char *name; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL) { /* XXX TODO DISCONECTED PICs */ return (EINVAL); } data = rman_get_virtual(res); name = device_get_nameunit(dev); #ifdef INTR_SOLO /* * Standard handling is done through MI interrupt framework. However, * some interrupts could request solely own special handling. This * non standard handling can be used for interrupt controllers without * handler (filter only), so in case that interrupt controllers are * chained, MI interrupt framework is called only in leaf controller. * * Note that root interrupt controller routine is served as well, * however in intr_irq_handler(), i.e. main system dispatch routine. */ if (flags & INTR_SOLO && hand != NULL) { debugf("irq %u cannot solo on %s\n", irq, name); return (EINVAL); } if (flags & INTR_SOLO) { error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt, arg, cookiep); debugf("irq %u setup filter error %d on %s\n", irq, error, name); } else #endif { error = isrc_add_handler(isrc, name, filt, hand, arg, flags, cookiep); debugf("irq %u add handler error %d on %s\n", irq, error, name); } if (error != 0) return (error); mtx_lock(&isrc_table_lock); error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data); if (error == 0) { isrc->isrc_handlers++; if (isrc->isrc_handlers == 1) PIC_ENABLE_INTR(isrc->isrc_dev, isrc); } mtx_unlock(&isrc_table_lock); if (error != 0) intr_event_remove_handler(*cookiep); return (error); } int intr_teardown_irq(device_t dev, struct resource *res, void *cookie) { int error; struct intr_map_data *data; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); data = rman_get_virtual(res); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc->isrc_filter = NULL; isrc->isrc_arg = NULL; isrc->isrc_handlers = 0; PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); isrc_update_name(isrc, NULL); mtx_unlock(&isrc_table_lock); return (0); } #endif if (isrc != intr_handler_source(cookie)) return (EINVAL); error = intr_event_remove_handler(cookie); if (error == 0) { mtx_lock(&isrc_table_lock); isrc->isrc_handlers--; if (isrc->isrc_handlers == 0) PIC_DISABLE_INTR(isrc->isrc_dev, isrc); PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } int intr_describe_irq(device_t dev, struct resource *res, void *cookie, const char *descr) { int error; struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) { if (isrc != cookie) return (EINVAL); mtx_lock(&isrc_table_lock); isrc_update_name(isrc, descr); mtx_unlock(&isrc_table_lock); return (0); } #endif error = intr_event_describe_handler(isrc->isrc_event, cookie, descr); if (error == 0) { mtx_lock(&isrc_table_lock); intrcnt_updatename(isrc); mtx_unlock(&isrc_table_lock); } return (error); } #ifdef SMP int intr_bind_irq(device_t dev, struct resource *res, int cpu) { struct intr_irqsrc *isrc; u_int res_id; KASSERT(rman_get_start(res) == rman_get_end(res), ("%s: more interrupts in resource", __func__)); res_id = (u_int)rman_get_start(res); isrc = intr_map_get_isrc(res_id); if (isrc == NULL || isrc->isrc_handlers == 0) return (EINVAL); #ifdef INTR_SOLO if (isrc->isrc_filter != NULL) return (intr_isrc_assign_cpu(isrc, cpu)); #endif return (intr_event_bind(isrc->isrc_event, cpu)); } /* * Return the CPU that the next interrupt source should use. * For now just returns the next CPU according to round-robin. */ u_int intr_irq_next_cpu(u_int last_cpu, cpuset_t *cpumask) { if (!irq_assign_cpu || mp_ncpus == 1) return (PCPU_GET(cpuid)); do { last_cpu++; if (last_cpu > mp_maxid) last_cpu = 0; } while (!CPU_ISSET(last_cpu, cpumask)); return (last_cpu); } /* * Distribute all the interrupt sources among the available * CPUs once the AP's have been launched. */ static void intr_irq_shuffle(void *arg __unused) { struct intr_irqsrc *isrc; u_int i; if (mp_ncpus == 1) return; mtx_lock(&isrc_table_lock); irq_assign_cpu = TRUE; for (i = 0; i < NIRQ; i++) { isrc = irq_sources[i]; if (isrc == NULL || isrc->isrc_handlers == 0 || isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI)) continue; if (isrc->isrc_event != NULL && isrc->isrc_flags & INTR_ISRCF_BOUND && isrc->isrc_event->ie_cpu != CPU_FFS(&isrc->isrc_cpu) - 1) panic("%s: CPU inconsistency", __func__); if ((isrc->isrc_flags & INTR_ISRCF_BOUND) == 0) CPU_ZERO(&isrc->isrc_cpu); /* start again */ /* * We are in wicked position here if the following call fails * for bound ISRC. The best thing we can do is to clear * isrc_cpu so inconsistency with ie_cpu will be detectable. */ if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0) CPU_ZERO(&isrc->isrc_cpu); } mtx_unlock(&isrc_table_lock); } SYSINIT(intr_irq_shuffle, SI_SUB_SMP, SI_ORDER_SECOND, intr_irq_shuffle, NULL); #else u_int intr_irq_next_cpu(u_int current_cpu, cpuset_t *cpumask) { return (PCPU_GET(cpuid)); } #endif /* * Allocate memory for new intr_map_data structure. * Initialize common fields. */ struct intr_map_data * intr_alloc_map_data(enum intr_map_data_type type, size_t len, int flags) { struct intr_map_data *data; data = malloc(len, M_INTRNG, flags); data->type = type; data->len = len; return (data); } void intr_free_intr_map_data(struct intr_map_data *data) { free(data, M_INTRNG); } /* * Register a MSI/MSI-X interrupt controller */ int intr_msi_register(device_t dev, intptr_t xref) { struct intr_pic *pic; if (dev == NULL) return (EINVAL); pic = pic_create(dev, xref, FLAG_MSI); if (pic == NULL) return (ENOMEM); debugf("PIC %p registered for %s \n", pic, device_get_nameunit(dev), dev, (uintmax_t)xref); return (0); } int intr_alloc_msi(device_t pci, device_t child, intptr_t xref, int count, int maxcount, int *irqs) { struct intr_irqsrc **isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err, i; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); err = MSI_ALLOC_MSI(pic->pic_dev, child, count, maxcount, &pdev, isrc); if (err != 0) { free(isrc, M_INTRNG); return (err); } for (i = 0; i < count; i++) { msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi-> isrc = isrc[i]; irqs[i] = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); } free(isrc, M_INTRNG); return (err); } int intr_release_msi(device_t pci, device_t child, intptr_t xref, int count, int *irqs) { struct intr_irqsrc **isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int i, err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK); for (i = 0; i < count; i++) { msi = (struct intr_map_data_msi *) intr_map_get_map_data(irqs[i]); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irqs[i])); isrc[i] = msi->isrc; } err = MSI_RELEASE_MSI(pic->pic_dev, child, count, isrc); for (i = 0; i < count; i++) { if (isrc[i] != NULL) intr_unmap_irq(irqs[i]); } free(isrc, M_INTRNG); return (err); } int intr_alloc_msix(device_t pci, device_t child, intptr_t xref, int *irq) { struct intr_irqsrc *isrc; struct intr_pic *pic; device_t pdev; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); err = MSI_ALLOC_MSIX(pic->pic_dev, child, &pdev, &isrc); if (err != 0) return (err); msi = (struct intr_map_data_msi *)intr_alloc_map_data( INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO); msi->isrc = isrc; *irq = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi); return (0); } int intr_release_msix(device_t pci, device_t child, intptr_t xref, int irq) { struct intr_irqsrc *isrc; struct intr_pic *pic; struct intr_map_data_msi *msi; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); msi = (struct intr_map_data_msi *) intr_map_get_map_data(irq); KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI, ("%s: irq %d map data is not MSI", __func__, irq)); isrc = msi->isrc; if (isrc == NULL) { intr_unmap_irq(irq); return (EINVAL); } err = MSI_RELEASE_MSIX(pic->pic_dev, child, isrc); intr_unmap_irq(irq); return (err); } int intr_map_msi(device_t pci, device_t child, intptr_t xref, int irq, uint64_t *addr, uint32_t *data) { struct intr_irqsrc *isrc; struct intr_pic *pic; int err; pic = pic_lookup(NULL, xref, FLAG_MSI); if (pic == NULL) return (ESRCH); KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI, ("%s: Found a non-MSI controller: %s", __func__, device_get_name(pic->pic_dev))); isrc = intr_map_get_isrc(irq); if (isrc == NULL) return (EINVAL); err = MSI_MAP_MSI(pic->pic_dev, child, isrc, addr, data); return (err); } void dosoftints(void); void dosoftints(void) { } #ifdef SMP /* * Init interrupt controller on another CPU. */ void intr_pic_init_secondary(void) { /* * QQQ: Only root PIC is aware of other CPUs ??? */ KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); //mtx_lock(&isrc_table_lock); PIC_INIT_SECONDARY(intr_irq_root_dev); //mtx_unlock(&isrc_table_lock); } #endif #ifdef DDB DB_SHOW_COMMAND(irqs, db_show_irqs) { u_int i, irqsum; u_long num; struct intr_irqsrc *isrc; for (irqsum = 0, i = 0; i < NIRQ; i++) { isrc = irq_sources[i]; if (isrc == NULL) continue; num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0; db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i, isrc->isrc_name, isrc->isrc_cpu.__bits[0], isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num); irqsum += num; } db_printf("irq total %u\n", irqsum); } #endif /* * Interrupt mapping table functions. * * Please, keep this part separately, it can be transformed to * extension of standard resources. */ struct intr_map_entry { device_t dev; intptr_t xref; struct intr_map_data *map_data; struct intr_irqsrc *isrc; /* XXX TODO DISCONECTED PICs */ /*int flags */ }; /* XXX Convert irq_map[] to dynamicaly expandable one. */ static struct intr_map_entry *irq_map[2 * NIRQ]; static int irq_map_count = nitems(irq_map); static int irq_map_first_free_idx; static struct mtx irq_map_lock; static struct intr_irqsrc * intr_map_get_isrc(u_int res_id) { struct intr_irqsrc *isrc; mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) { mtx_unlock(&irq_map_lock); return (NULL); } isrc = irq_map[res_id]->isrc; mtx_unlock(&irq_map_lock); return (isrc); } static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc) { mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) { mtx_unlock(&irq_map_lock); return; } irq_map[res_id]->isrc = isrc; mtx_unlock(&irq_map_lock); } /* * Get a copy of intr_map_entry data */ static struct intr_map_data * intr_map_get_map_data(u_int res_id) { struct intr_map_data *data; data = NULL; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); data = irq_map[res_id]->map_data; mtx_unlock(&irq_map_lock); return (data); } /* * Get a copy of intr_map_entry data */ static void intr_map_copy_map_data(u_int res_id, device_t *map_dev, intptr_t *map_xref, struct intr_map_data **data) { size_t len; len = 0; mtx_lock(&irq_map_lock); if (res_id >= irq_map_count || irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (irq_map[res_id]->map_data != NULL) len = irq_map[res_id]->map_data->len; mtx_unlock(&irq_map_lock); if (len == 0) *data = NULL; else *data = malloc(len, M_INTRNG, M_WAITOK | M_ZERO); mtx_lock(&irq_map_lock); if (irq_map[res_id] == NULL) panic("Attempt to copy invalid resource id: %u\n", res_id); if (len != 0) { if (len != irq_map[res_id]->map_data->len) panic("Resource id: %u has changed.\n", res_id); memcpy(*data, irq_map[res_id]->map_data, len); } *map_dev = irq_map[res_id]->dev; *map_xref = irq_map[res_id]->xref; mtx_unlock(&irq_map_lock); } /* * Allocate and fill new entry in irq_map table. */ u_int intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data) { u_int i; struct intr_map_entry *entry; /* Prepare new entry first. */ entry = malloc(sizeof(*entry), M_INTRNG, M_WAITOK | M_ZERO); entry->dev = dev; entry->xref = xref; entry->map_data = data; entry->isrc = NULL; mtx_lock(&irq_map_lock); for (i = irq_map_first_free_idx; i < irq_map_count; i++) { if (irq_map[i] == NULL) { irq_map[i] = entry; irq_map_first_free_idx = i + 1; mtx_unlock(&irq_map_lock); return (i); } } mtx_unlock(&irq_map_lock); /* XXX Expand irq_map table */ panic("IRQ mapping table is full."); } /* * Remove and free mapping entry. */ void intr_unmap_irq(u_int res_id) { struct intr_map_entry *entry; mtx_lock(&irq_map_lock); if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL)) panic("Attempt to unmap invalid resource id: %u\n", res_id); entry = irq_map[res_id]; irq_map[res_id] = NULL; irq_map_first_free_idx = res_id; mtx_unlock(&irq_map_lock); intr_free_intr_map_data(entry->map_data); free(entry, M_INTRNG); } /* * Clone mapping entry. */ u_int intr_map_clone_irq(u_int old_res_id) { device_t map_dev; intptr_t map_xref; struct intr_map_data *data; intr_map_copy_map_data(old_res_id, &map_dev, &map_xref, &data); return (intr_map_irq(map_dev, map_xref, data)); } static void intr_map_init(void *dummy __unused) { mtx_init(&irq_map_lock, "intr map table", NULL, MTX_DEF); } SYSINIT(intr_map_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_map_init, NULL); Index: head/sys/kern/subr_syscall.c =================================================================== --- head/sys/kern/subr_syscall.c (revision 317060) +++ head/sys/kern/subr_syscall.c (revision 317061) @@ -1,268 +1,268 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2010 Konstantin Belousov * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include "opt_capsicum.h" #include "opt_ktrace.h" __FBSDID("$FreeBSD$"); #include #include #include #ifdef KTRACE #include #include #endif #include static inline int syscallenter(struct thread *td, struct syscall_args *sa) { struct proc *p; int error, traced; - PCPU_INC(cnt.v_syscall); + VM_CNT_INC(v_syscall); p = td->td_proc; td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); traced = (p->p_flag & P_TRACED) != 0; if (traced || td->td_dbgflags & TDB_USERWR) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; if (traced) td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (error == 0) { STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED) { PROC_LOCK(p); td->td_dbg_sc_code = sa->code; td->td_dbg_sc_narg = sa->narg; if (p->p_ptevents & PTRACE_SCE) ptracestop((td), SIGTRAP, NULL); PROC_UNLOCK(p); } if (td->td_dbgflags & TDB_USERWR) { /* * Reread syscall number and arguments if * debugger modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); PROC_LOCK(p); td->td_dbg_sc_code = sa->code; td->td_dbg_sc_narg = sa->narg; PROC_UNLOCK(p); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif if (error != 0) goto retval; } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (IN_CAPABILITY_MODE(td) && !(sa->callp->sy_flags & SYF_CAPENABLED)) { error = ECAPMODE; goto retval; } #endif error = syscall_thread_enter(td, sa->callp); if (error != 0) goto retval; #ifdef KDTRACE_HOOKS /* Give the syscall:::entry DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0) (*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (sa->callp->sy_call)(td, sa->args); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ if ((td->td_pflags & TDP_NERRNO) == 0) td->td_errno = error; #ifdef KDTRACE_HOOKS /* Give the syscall:::return DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_return != 0) (*systrace_probe_func)(sa, SYSTRACE_RETURN, error ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); } retval: KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, "retval0:%#lx", td->td_retval[0], "retval1:%#lx", td->td_retval[1]); if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); return (error); } static inline void syscallret(struct thread *td, int error, struct syscall_args *sa) { struct proc *p, *p2; ksiginfo_t ksi; int traced, error1; KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); p = td->td_proc; if ((trap_enotcap || (p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) { error1 = (td->td_pflags & TDP_NERRNO) == 0 ? error : td->td_errno; if (error1 == ENOTCAPABLE || error1 == ECAPMODE) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_errno = error1; ksi.ksi_code = TRAP_CAP; trapsignal(td, &ksi); } } /* * Handle reschedule and other end-of-syscall issues */ userret(td, td->td_frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ? error : td->td_errno, td->td_retval[0]); } #endif td->td_pflags &= ~TDP_NERRNO; if (p->p_flag & P_TRACED) { traced = 1; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); } else traced = 0; /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, sa->code); if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) { PROC_LOCK(p); /* * If tracing the execed process, trap to the debugger * so that breakpoints can be set before the program * executes. If debugger requested tracing of syscall * returns, do it now too. */ if (traced && ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 || (p->p_ptevents & PTRACE_SCX) != 0)) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK); PROC_UNLOCK(p); } if (td->td_pflags & TDP_RFPPWAIT) { /* * Preserve synchronization semantics of vfork. If * waiting for child to exec or exit, fork set * P_PPWAIT on child, and there we sleep on our proc * (in case of exit). * * Do it after the ptracestop() above is finished, to * not block our debugger until child execs or exits * to finish vfork wait. */ td->td_pflags &= ~TDP_RFPPWAIT; p2 = td->td_rfppwait_p; again: PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) { PROC_LOCK(p); if (thread_suspend_check_needed()) { PROC_UNLOCK(p2); thread_suspend_check(0); PROC_UNLOCK(p); goto again; } else { PROC_UNLOCK(p); } cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); if (td->td_dbgflags & TDB_VFORK) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_VFORK) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_VFORK; PROC_UNLOCK(p); } } } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 317060) +++ head/sys/kern/subr_trap.c (revision 317061) @@ -1,347 +1,347 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007 The FreeBSD Foundation * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #ifdef VIMAGE #include #endif #ifdef HWPMC_HOOKS #include #endif #include void (*softdep_ast_cleanup)(struct thread *); /* * Define the code needed before returning to user mode, for trap and * syscall. */ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, td->td_name); KASSERT((p->p_flag & P_WEXIT) == 0, ("Exiting process returns to usermode")); #ifdef DIAGNOSTIC /* * Check that we called signotify() enough. For * multi-threaded processes, where signal distribution might * change due to other threads changing sigmask, the check is * racy and cannot be performed reliably. * If current process is vfork child, indicated by P_PPWAIT, then * issignal() ignores stops, so we block the check to avoid * classifying pending signals. */ if (p->p_numthreads == 1) { PROC_LOCK(p); thread_lock(td); if ((p->p_flag & P_PPWAIT) == 0) { KASSERT(!SIGPENDING(td) || (td->td_flags & (TDF_NEEDSIGCHK | TDF_ASTPENDING)) == (TDF_NEEDSIGCHK | TDF_ASTPENDING), ("failed to set signal flags for ast p %p " "td %p fl %x", p, td, td->td_flags)); } thread_unlock(td); PROC_UNLOCK(p); } #endif #ifdef KTRACE KTRUSERRET(td); #endif td_softdep_cleanup(td); MPASS(td->td_su == NULL); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); /* * Let the scheduler adjust our priority etc. */ sched_userret(td); /* * Check for misbehavior. * * In case there is a callchain tracing ongoing because of * hwpmc(4), skip the scheduler pinning check. * hwpmc(4) subsystem, infact, will collect callchain informations * at ast() checkpoint, which is past userret(). */ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning"); KASSERT(td->td_critnest == 0, ("userret: Returning in a critical section")); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held", td->td_locks)); KASSERT(td->td_rw_rlocks == 0, ("userret: Returning with %d rwlocks held in read mode", td->td_rw_rlocks)); KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, ("userret: Returning with pagefaults disabled")); KASSERT(td->td_no_sleeping == 0, ("userret: Returning with sleep disabled")); KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0, ("userret: Returning with with pinned thread")); KASSERT(td->td_vp_reserv == 0, ("userret: Returning while holding vnode reservation")); KASSERT((td->td_flags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("userret: Returning with stop signals deferred")); KASSERT(td->td_su == NULL, ("userret: Returning with SU cleanup request not handled")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s", __func__, td, p->p_pid, td->td_name, curvnet, (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A")); #endif #ifdef RACCT if (racct_enable && p->p_throttled != 0) { PROC_LOCK(p); while (p->p_throttled != 0) { msleep(p->p_racct, &p->p_mtx, 0, "racct", p->p_throttled < 0 ? 0 : p->p_throttled); if (p->p_throttled > 0) p->p_throttled = 0; } PROC_UNLOCK(p); } #endif } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. */ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; int flags; int sig; td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; /* * This updates the td_flag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in flags, the astpending flag will be set and * ast() will be called again. */ thread_lock(td); flags = td->td_flags; td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK | TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND); thread_unlock(td); - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } #ifdef HWPMC_HOOKS /* Handle Software PMC callchain capture. */ if (PMC_IS_PENDING_CALLCHAIN(td)) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep); #endif if (flags & TDF_ALRMPEND) { PROC_LOCK(p); kern_psignal(p, SIGVTALRM); PROC_UNLOCK(p); } if (flags & TDF_PROFPEND) { PROC_LOCK(p); kern_psignal(p, SIGPROF); PROC_UNLOCK(p); } #ifdef MAC if (flags & TDF_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1, __func__); #endif thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL); thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1, __func__); #endif } #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (flags & TDF_NEEDSIGCHK) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDF_NEEDSIGCHK should be re-read from * td_flags, since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. */ if ((p->p_flag & P_PPWAIT) == 0) { KASSERT(!SIGPENDING(td) || (td->td_flags & (TDF_NEEDSIGCHK | TDF_ASTPENDING)) == (TDF_NEEDSIGCHK | TDF_ASTPENDING), ("failed2 to set signal flags for ast p %p td %p " "fl %x %x", p, td, flags, td->td_flags)); } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. */ if (flags & TDF_NEEDSUSPCHK) { PROC_LOCK(p); thread_suspend_check(0); PROC_UNLOCK(p); } if (td->td_pflags & TDP_OLDMASK) { td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } userret(td, framep); } const char * syscallname(struct proc *p, u_int code) { static const char unknown[] = "unknown"; struct sysentvec *sv; sv = p->p_sysent; if (sv->sv_syscallnames == NULL || code >= sv->sv_size) return (unknown); return (sv->sv_syscallnames[code]); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 317060) +++ head/sys/kern/vfs_bio.c (revision 317061) @@ -1,4998 +1,4998 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; CTASSERT(MAXBCACHEBUF >= MAXBSIZE); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick ont sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are expected to be potentially * transient - underlying media might work if tried again * after EIO, and memory might be available after an ENOMEM. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); while (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_COUNT(desiredpages - bp->b_npages)); if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBCACHEBUF) panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, MAXBCACHEBUF); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); bufspace_release(maxsize); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; if (vm_page_sbusied(p)) printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); p->wire_count--; vm_page_free(p); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, pgsin); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/mips/include/counter.h =================================================================== --- head/sys/mips/include/counter.h (revision 317060) +++ head/sys/mips/include/counter.h (revision 317061) @@ -1,94 +1,96 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif +#define EARLY_COUNTER &((struct pcpu *)pcpu_space)->pc_early_dummy_counter + #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C /* XXXKIB non-atomic 64bit read on 32bit */ static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB non-atomic 64bit store on 32bit, might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/mips/include/intr_machdep.h =================================================================== --- head/sys/mips/include/intr_machdep.h (revision 317060) +++ head/sys/mips/include/intr_machdep.h (revision 317061) @@ -1,76 +1,76 @@ /*- * Copyright (c) 2004 Juli Mallett * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_INTR_MACHDEP_H_ #define _MACHINE_INTR_MACHDEP_H_ #include #include #if defined(CPU_RMI) || defined(CPU_NLM) #define XLR_MAX_INTR 64 #else #define NHARD_IRQS 6 #define NSOFT_IRQS 2 #endif struct trapframe; void cpu_init_interrupts(void); void cpu_establish_hardintr(const char *, driver_filter_t *, driver_intr_t *, void *, int, int, void **); void cpu_establish_softintr(const char *, driver_filter_t *, void (*)(void*), void *, int, int, void **); void cpu_intr(struct trapframe *); /* * Allow a platform to override the default hard interrupt mask and unmask * functions. The 'arg' can be cast safely to an 'int' and holds the mips * hard interrupt number to mask or unmask. */ typedef void (*cpu_intr_mask_t)(void *arg); typedef void (*cpu_intr_unmask_t)(void *arg); void cpu_set_hardintr_mask_func(cpu_intr_mask_t func); void cpu_set_hardintr_unmask_func(cpu_intr_unmask_t func); /* * Opaque datatype that represents intr counter */ typedef unsigned long* mips_intrcnt_t; mips_intrcnt_t mips_intrcnt_create(const char *); void mips_intrcnt_setname(mips_intrcnt_t, const char *); static __inline void mips_intrcnt_inc(mips_intrcnt_t counter) { if (counter) atomic_add_long(counter, 1); - PCPU_INC(cnt.v_intr); + VM_CNT_INC(v_intr); } #endif /* !_MACHINE_INTR_MACHDEP_H_ */ Index: head/sys/mips/include/pcpu.h =================================================================== --- head/sys/mips/include/pcpu.h (revision 317060) +++ head/sys/mips/include/pcpu.h (revision 317061) @@ -1,92 +1,92 @@ /*- * Copyright (c) 1999 Luoqi Chen * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: src/sys/alpha/include/pcpu.h,v 1.15 2004/11/05 19:16:44 jhb * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #include #include #define PCPU_MD_COMMON_FIELDS \ pd_entry_t *pc_segbase; /* curthread segbase */ \ struct pmap *pc_curpmap; /* pmap of curthread */ \ u_int32_t pc_next_asid; /* next ASID to alloc */ \ u_int32_t pc_asid_generation; /* current ASID generation */ \ u_int pc_pending_ipis; /* IPIs pending to this CPU */ \ struct pcpu *pc_self; /* globally-uniqe self pointer */ #ifdef __mips_n64 #define PCPU_MD_MIPS64_FIELDS \ PCPU_MD_COMMON_FIELDS \ - char __pad[53] + char __pad[245] #else #define PCPU_MD_MIPS32_FIELDS \ PCPU_MD_COMMON_FIELDS \ - char __pad[189] + char __pad[125] #endif #ifdef __mips_n64 #define PCPU_MD_FIELDS PCPU_MD_MIPS64_FIELDS #else #define PCPU_MD_FIELDS PCPU_MD_MIPS32_FIELDS #endif #ifdef _KERNEL extern char pcpu_space[MAXCPU][PAGE_SIZE * 2]; #define PCPU_ADDR(cpu) (struct pcpu *)(pcpu_space[(cpu)]) extern struct pcpu *pcpup; #define PCPUP pcpup /* * Since we use a wired TLB entry to map the same VA to a different * physical page for each CPU, get_pcpu() must use the pc_self * field to obtain a globally-unique pointer. */ #define get_pcpu() (PCPUP->pc_self) #define PCPU_ADD(member, value) (PCPUP->pc_ ## member += (value)) #define PCPU_GET(member) (PCPUP->pc_ ## member) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&PCPUP->pc_ ## member) #define PCPU_SET(member,value) (PCPUP->pc_ ## member = (value)) #define PCPU_LAZY_INC(member) (++PCPUP->pc_ ## member) #ifdef SMP /* * Instantiate the wired TLB entry at PCPU_TLB_ENTRY to map 'pcpu' at 'pcpup'. */ void mips_pcpu_tlb_init(struct pcpu *pcpu); #endif #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/powerpc/include/counter.h =================================================================== --- head/sys/powerpc/include/counter.h (revision 317060) +++ head/sys/powerpc/include/counter.h (revision 317061) @@ -1,162 +1,164 @@ /*- * Copyright (c) 2012, 2013 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif +extern struct pcpu __pcpu[]; + +#define EARLY_COUNTER &__pcpu[0].pc_early_dummy_counter + #ifdef __powerpc64__ #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, i) counter_u64_add(c, i) - -extern struct pcpu __pcpu[MAXCPU]; static inline void counter_u64_add(counter_u64_t c, int64_t inc) { uint64_t ccpu, old; __asm __volatile("\n" "1:\n\t" "mfsprg %0, 0\n\t" "ldarx %1, %0, %2\n\t" "add %1, %1, %3\n\t" "stdcx. %1, %0, %2\n\t" "bne- 1b" : "=&b" (ccpu), "=&r" (old) : "r" ((char *)c - (char *)&__pcpu[0]), "r" (inc) : "cr0", "memory"); } #else /* !64bit */ #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C /* XXXKIB non-atomic 64bit read */ static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB non-atomic 64bit store, might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* 64bit */ #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/powerpc/include/pcpu.h =================================================================== --- head/sys/powerpc/include/pcpu.h (revision 317060) +++ head/sys/powerpc/include/pcpu.h (revision 317061) @@ -1,180 +1,180 @@ /*- * Copyright (c) 1999 Luoqi Chen * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #include #include #include struct pmap; struct pvo_entry; #define CPUSAVE_LEN 9 #define PCPU_MD_COMMON_FIELDS \ int pc_inside_intr; \ struct pmap *pc_curpmap; /* current pmap */ \ struct thread *pc_fputhread; /* current fpu user */ \ struct thread *pc_vecthread; /* current vec user */ \ uintptr_t pc_hwref; \ uint32_t pc_pir; \ int pc_bsp; \ volatile int pc_awake; \ uint32_t pc_ipimask; \ register_t pc_tempsave[CPUSAVE_LEN]; \ register_t pc_disisave[CPUSAVE_LEN]; \ register_t pc_dbsave[CPUSAVE_LEN]; \ void *pc_restore; #define PCPU_MD_AIM32_FIELDS \ vm_offset_t pc_qmap_addr; \ struct pvo_entry *pc_qmap_pvo; \ struct mtx pc_qmap_lock; \ - /* char __pad[0] */ + char __pad[128] #define PCPU_MD_AIM64_FIELDS \ struct slb pc_slb[64]; \ struct slb **pc_userslb; \ register_t pc_slbsave[18]; \ uint8_t pc_slbstack[1024]; \ vm_offset_t pc_qmap_addr; \ struct pvo_entry *pc_qmap_pvo; \ struct mtx pc_qmap_lock; \ - char __pad[1121 - sizeof(struct mtx)] + char __pad[1345] #ifdef __powerpc64__ #define PCPU_MD_AIM_FIELDS PCPU_MD_AIM64_FIELDS #else #define PCPU_MD_AIM_FIELDS PCPU_MD_AIM32_FIELDS #endif #define BOOKE_CRITSAVE_LEN (CPUSAVE_LEN + 2) #define BOOKE_TLB_MAXNEST 3 #define BOOKE_TLB_SAVELEN 16 #define BOOKE_TLBSAVE_LEN (BOOKE_TLB_SAVELEN * BOOKE_TLB_MAXNEST) #ifdef __powerpc64__ -#define BOOKE_PCPU_PAD 773 +#define BOOKE_PCPU_PAD 901 #else -#define BOOKE_PCPU_PAD 173 +#define BOOKE_PCPU_PAD 429 #endif #define PCPU_MD_BOOKE_FIELDS \ register_t pc_booke_critsave[BOOKE_CRITSAVE_LEN]; \ register_t pc_booke_mchksave[CPUSAVE_LEN]; \ register_t pc_booke_tlbsave[BOOKE_TLBSAVE_LEN]; \ register_t pc_booke_tlb_level; \ vm_offset_t pc_qmap_addr; \ uintptr_t *pc_booke_tlb_lock; \ int pc_tid_next; \ char __pad[BOOKE_PCPU_PAD] /* Definitions for register offsets within the exception tmp save areas */ #define CPUSAVE_R27 0 /* where r27 gets saved */ #define CPUSAVE_R28 1 /* where r28 gets saved */ #define CPUSAVE_R29 2 /* where r29 gets saved */ #define CPUSAVE_R30 3 /* where r30 gets saved */ #define CPUSAVE_R31 4 /* where r31 gets saved */ #define CPUSAVE_AIM_DAR 5 /* where SPR_DAR gets saved */ #define CPUSAVE_AIM_DSISR 6 /* where SPR_DSISR gets saved */ #define CPUSAVE_BOOKE_DEAR 5 /* where SPR_DEAR gets saved */ #define CPUSAVE_BOOKE_ESR 6 /* where SPR_ESR gets saved */ #define CPUSAVE_SRR0 7 /* where SRR0 gets saved */ #define CPUSAVE_SRR1 8 /* where SRR1 gets saved */ #define BOOKE_CRITSAVE_SRR0 9 /* where real SRR0 gets saved (critical) */ #define BOOKE_CRITSAVE_SRR1 10 /* where real SRR0 gets saved (critical) */ /* Book-E TLBSAVE is more elaborate */ #define TLBSAVE_BOOKE_LR 0 #define TLBSAVE_BOOKE_CR 1 #define TLBSAVE_BOOKE_SRR0 2 #define TLBSAVE_BOOKE_SRR1 3 #define TLBSAVE_BOOKE_R20 4 #define TLBSAVE_BOOKE_R21 5 #define TLBSAVE_BOOKE_R22 6 #define TLBSAVE_BOOKE_R23 7 #define TLBSAVE_BOOKE_R24 8 #define TLBSAVE_BOOKE_R25 9 #define TLBSAVE_BOOKE_R26 10 #define TLBSAVE_BOOKE_R27 11 #define TLBSAVE_BOOKE_R28 12 #define TLBSAVE_BOOKE_R29 13 #define TLBSAVE_BOOKE_R30 14 #define TLBSAVE_BOOKE_R31 15 #ifdef AIM #define PCPU_MD_FIELDS \ PCPU_MD_COMMON_FIELDS \ PCPU_MD_AIM_FIELDS #endif #if defined(BOOKE) #define PCPU_MD_FIELDS \ PCPU_MD_COMMON_FIELDS \ PCPU_MD_BOOKE_FIELDS #endif /* * Catch-all for ports (e.g. lsof, used by gtop) */ #ifndef PCPU_MD_FIELDS #define PCPU_MD_FIELDS \ int pc_md_placeholder[32] #endif #ifdef _KERNEL #define pcpup (get_pcpu()) static __inline __pure2 struct thread * __curthread(void) { struct thread *td; #ifdef __powerpc64__ __asm __volatile("mr %0,13" : "=r"(td)); #else __asm __volatile("mr %0,2" : "=r"(td)); #endif return (td); } #define curthread (__curthread()) #define PCPU_GET(member) (pcpup->pc_ ## member) /* * XXX The implementation of this operation should be made atomic * with respect to preemption. */ #define PCPU_ADD(member, value) (pcpup->pc_ ## member += (value)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&pcpup->pc_ ## member) #define PCPU_SET(member,value) (pcpup->pc_ ## member = (value)) #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/powerpc/powerpc/trap.c =================================================================== --- head/sys/powerpc/powerpc/trap.c (revision 317060) +++ head/sys/powerpc/powerpc/trap.c (revision 317061) @@ -1,881 +1,881 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: trap.c,v 1.58 2002/03/04 04:07:35 dbj Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Below matches setjmp.S */ #define FAULTBUF_LR 21 #define FAULTBUF_R1 1 #define FAULTBUF_R2 2 #define FAULTBUF_CR 22 #define FAULTBUF_R14 3 #define MOREARGS(sp) ((caddr_t)((uintptr_t)(sp) + \ sizeof(struct callframe) - 3*sizeof(register_t))) /* more args go here */ static void trap_fatal(struct trapframe *frame); static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user); static int trap_pfault(struct trapframe *frame, int user); static int fix_unaligned(struct thread *td, struct trapframe *frame); static int handle_onfault(struct trapframe *frame); static void syscall(struct trapframe *frame); #if defined(__powerpc64__) && defined(AIM) void handle_kernel_slb_spill(int, register_t, register_t); static int handle_user_slb_spill(pmap_t pm, vm_offset_t addr); extern int n_slbs; #endif #ifdef KDB int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ #endif struct powerpc_exception { u_int vector; char *name; }; #ifdef KDTRACE_HOOKS #include int (*dtrace_invop_jump_addr)(struct trapframe *); #endif static struct powerpc_exception powerpc_exceptions[] = { { EXC_CRIT, "critical input" }, { EXC_RST, "system reset" }, { EXC_MCHK, "machine check" }, { EXC_DSI, "data storage interrupt" }, { EXC_DSE, "data segment exception" }, { EXC_ISI, "instruction storage interrupt" }, { EXC_ISE, "instruction segment exception" }, { EXC_EXI, "external interrupt" }, { EXC_ALI, "alignment" }, { EXC_PGM, "program" }, { EXC_FPU, "floating-point unavailable" }, { EXC_APU, "auxiliary proc unavailable" }, { EXC_DECR, "decrementer" }, { EXC_FIT, "fixed-interval timer" }, { EXC_WDOG, "watchdog timer" }, { EXC_SC, "system call" }, { EXC_TRC, "trace" }, { EXC_FPA, "floating-point assist" }, { EXC_DEBUG, "debug" }, { EXC_PERF, "performance monitoring" }, { EXC_VEC, "altivec unavailable" }, { EXC_VSX, "vsx unavailable" }, { EXC_ITMISS, "instruction tlb miss" }, { EXC_DLMISS, "data load tlb miss" }, { EXC_DSMISS, "data store tlb miss" }, { EXC_BPT, "instruction breakpoint" }, { EXC_SMI, "system management" }, { EXC_VECAST_G4, "altivec assist" }, { EXC_THRM, "thermal management" }, { EXC_RUNMODETRC, "run mode/trace" }, { EXC_LAST, NULL } }; static const char * trapname(u_int vector) { struct powerpc_exception *pe; for (pe = powerpc_exceptions; pe->vector != EXC_LAST; pe++) { if (pe->vector == vector) return (pe->name); } return ("unknown"); } void trap(struct trapframe *frame) { struct thread *td; struct proc *p; #ifdef KDTRACE_HOOKS uint32_t inst; #endif int sig, type, user; u_int ucode; ksiginfo_t ksi; - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); td = curthread; p = td->td_proc; type = ucode = frame->exc; sig = 0; user = frame->srr1 & PSL_PR; CTR3(KTR_TRAP, "trap: %s type=%s (%s)", td->td_name, trapname(type), user ? "user" : "kernel"); #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type) != 0) return; #endif if (user) { td->td_pticks = 0; td->td_frame = frame; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); /* User Mode Traps */ switch (type) { case EXC_RUNMODETRC: case EXC_TRC: frame->srr1 &= ~PSL_SE; sig = SIGTRAP; ucode = TRAP_TRACE; break; #if defined(__powerpc64__) && defined(AIM) case EXC_ISE: case EXC_DSE: if (handle_user_slb_spill(&p->p_vmspace->vm_pmap, (type == EXC_ISE) ? frame->srr0 : frame->dar) != 0){ sig = SIGSEGV; ucode = SEGV_MAPERR; } break; #endif case EXC_DSI: case EXC_ISI: sig = trap_pfault(frame, 1); if (sig == SIGSEGV) ucode = SEGV_MAPERR; break; case EXC_SC: syscall(frame); break; case EXC_FPU: KASSERT((td->td_pcb->pcb_flags & PCB_FPU) != PCB_FPU, ("FPU already enabled for thread")); enable_fpu(td); break; case EXC_VEC: KASSERT((td->td_pcb->pcb_flags & PCB_VEC) != PCB_VEC, ("Altivec already enabled for thread")); enable_vec(td); break; case EXC_VSX: KASSERT((td->td_pcb->pcb_flags & PCB_VSX) != PCB_VSX, ("VSX already enabled for thread")); if (!(td->td_pcb->pcb_flags & PCB_VEC)) enable_vec(td); if (!(td->td_pcb->pcb_flags & PCB_FPU)) save_fpu(td); td->td_pcb->pcb_flags |= PCB_VSX; enable_fpu(td); break; case EXC_VECAST_E: case EXC_VECAST_G4: case EXC_VECAST_G5: /* * We get a VPU assist exception for IEEE mode * vector operations on denormalized floats. * Emulating this is a giant pain, so for now, * just switch off IEEE mode and treat them as * zero. */ save_vec(td); td->td_pcb->pcb_vec.vscr |= ALTIVEC_VSCR_NJ; enable_vec(td); break; case EXC_ALI: if (fix_unaligned(td, frame) != 0) { sig = SIGBUS; ucode = BUS_ADRALN; } else frame->srr0 += 4; break; case EXC_DEBUG: /* Single stepping */ mtspr(SPR_DBSR, mfspr(SPR_DBSR)); frame->srr1 &= ~PSL_DE; frame->cpu.booke.dbcr0 &= ~(DBCR0_IDM | DBCR0_IC); sig = SIGTRAP; ucode = TRAP_TRACE; break; case EXC_PGM: /* Identify the trap reason */ #ifdef AIM if (frame->srr1 & EXC_PGM_TRAP) { #else if (frame->cpu.booke.esr & ESR_PTR) { #endif #ifdef KDTRACE_HOOKS inst = fuword32((const void *)frame->srr0); if (inst == 0x0FFFDDDD && dtrace_pid_probe_ptr != NULL) { struct reg regs; fill_regs(td, ®s); (*dtrace_pid_probe_ptr)(®s); break; } #endif sig = SIGTRAP; ucode = TRAP_BRKPT; } else { sig = ppc_instr_emulate(frame, td->td_pcb); if (sig == SIGILL) { if (frame->srr1 & EXC_PGM_PRIV) ucode = ILL_PRVOPC; else if (frame->srr1 & EXC_PGM_ILLEGAL) ucode = ILL_ILLOPC; } else if (sig == SIGFPE) ucode = FPE_FLTINV; /* Punt for now, invalid operation. */ } break; case EXC_MCHK: /* * Note that this may not be recoverable for the user * process, depending on the type of machine check, * but it at least prevents the kernel from dying. */ sig = SIGBUS; ucode = BUS_OBJERR; break; default: trap_fatal(frame); } } else { /* Kernel Mode Traps */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case EXC_PGM: #ifdef KDTRACE_HOOKS #ifdef AIM if (frame->srr1 & EXC_PGM_TRAP) { #else if (frame->cpu.booke.esr & ESR_PTR) { #endif if (*(uint32_t *)frame->srr0 == EXC_DTRACE) { if (dtrace_invop_jump_addr != NULL) { dtrace_invop_jump_addr(frame); return; } } } #endif #ifdef KDB if (db_trap_glue(frame)) return; #endif break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: if ((frame->dar & SEGMENT_MASK) == USER_ADDR) { __asm __volatile ("slbmte %0, %1" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); return; } break; #endif case EXC_DSI: if (trap_pfault(frame, 0) == 0) return; break; case EXC_MCHK: if (handle_onfault(frame)) return; break; default: break; } trap_fatal(frame); } if (sig != 0) { if (p->p_sysent->sv_transtrap != NULL) sig = (p->p_sysent->sv_transtrap)(sig, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int) ucode; /* XXX, not POSIX */ /* ksi.ksi_addr = ? */ ksi.ksi_trapno = type; trapsignal(td, &ksi); } userret(td, frame); } static void trap_fatal(struct trapframe *frame) { printtrap(frame->exc, frame, 1, (frame->srr1 & PSL_PR)); #ifdef KDB if ((debugger_on_panic || kdb_active) && kdb_trap(frame->exc, 0, frame)) return; #endif panic("%s trap", trapname(frame->exc)); } static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { uint16_t ver; #ifdef BOOKE vm_paddr_t pa; #endif printf("\n"); printf("%s %s trap:\n", isfatal ? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" exception = 0x%x (%s)\n", vector, trapname(vector)); switch (vector) { case EXC_DSE: case EXC_DSI: case EXC_DTMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar); #ifdef AIM printf(" dsisr = 0x%lx\n", (u_long)frame->cpu.aim.dsisr); #endif break; case EXC_ISE: case EXC_ISI: case EXC_ITMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->srr0); break; case EXC_MCHK: ver = mfpvr() >> 16; #if defined(AIM) if (MPC745X_P(ver)) printf(" msssr0 = 0x%lx\n", (u_long)mfspr(SPR_MSSSR0)); #elif defined(BOOKE) pa = mfspr(SPR_MCARU); pa = (pa << 32) | (u_register_t)mfspr(SPR_MCAR); printf(" mcsr = 0x%lx\n", (u_long)mfspr(SPR_MCSR)); printf(" mcar = 0x%jx\n", (uintmax_t)pa); #endif break; } #ifdef BOOKE printf(" esr = 0x%" PRIxPTR "\n", frame->cpu.booke.esr); #endif printf(" srr0 = 0x%" PRIxPTR "\n", frame->srr0); printf(" srr1 = 0x%lx\n", (u_long)frame->srr1); printf(" lr = 0x%" PRIxPTR "\n", frame->lr); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_name); printf("\n"); } /* * Handles a fatal fault when we have onfault state to recover. Returns * non-zero if there was onfault recovery state available. */ static int handle_onfault(struct trapframe *frame) { struct thread *td; jmp_buf *fb; td = curthread; fb = td->td_pcb->pcb_onfault; if (fb != NULL) { frame->srr0 = (*fb)->_jb[FAULTBUF_LR]; frame->fixreg[1] = (*fb)->_jb[FAULTBUF_R1]; frame->fixreg[2] = (*fb)->_jb[FAULTBUF_R2]; frame->fixreg[3] = 1; frame->cr = (*fb)->_jb[FAULTBUF_CR]; bcopy(&(*fb)->_jb[FAULTBUF_R14], &frame->fixreg[14], 18 * sizeof(register_t)); td->td_pcb->pcb_onfault = NULL; /* Returns twice, not thrice */ return (1); } return (0); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; caddr_t params; size_t argsz; int error, n, i; p = td->td_proc; frame = td->td_frame; sa->code = frame->fixreg[0]; params = (caddr_t)(frame->fixreg + FIRSTARG); n = NARGREG; if (sa->code == SYS_syscall) { /* * code is first argument, * followed by actual args. */ sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, * so as to maintain quad alignment * for the rest of the args. */ if (SV_PROC_FLAG(p, SV_ILP32)) { params += sizeof(register_t); sa->code = *(register_t *) params; params += sizeof(register_t); n -= 2; } else { sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (SV_PROC_FLAG(p, SV_ILP32)) { argsz = sizeof(uint32_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i] & 0xffffffff; } else { argsz = sizeof(uint64_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i]; } if (sa->narg > n) error = copyin(MOREARGS(frame->fixreg[1]), sa->args + n, (sa->narg - n) * argsz); else error = 0; #ifdef __powerpc64__ if (SV_PROC_FLAG(p, SV_ILP32) && sa->narg > n) { /* Expand the size of arguments copied from the stack */ for (i = sa->narg; i >= n; i--) sa->args[i] = ((uint32_t *)(&sa->args[n]))[i-n]; } #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->fixreg[FIRSTARG + 1]; } return (error); } #include "../../kern/subr_syscall.c" void syscall(struct trapframe *frame) { struct thread *td; struct syscall_args sa; int error; td = curthread; td->td_frame = frame; #if defined(__powerpc64__) && defined(AIM) /* * Speculatively restore last user SLB segment, which we know is * invalid already, since we are likely to do copyin()/copyout(). */ __asm __volatile ("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #endif error = syscallenter(td, &sa); syscallret(td, error, &sa); } #if defined(__powerpc64__) && defined(AIM) /* Handle kernel SLB faults -- runs in real mode, all seat belts off */ void handle_kernel_slb_spill(int type, register_t dar, register_t srr0) { struct slb *slbcache; uint64_t slbe, slbv; uint64_t esid, addr; int i; addr = (type == EXC_ISE) ? srr0 : dar; slbcache = PCPU_GET(slb); esid = (uintptr_t)addr >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; /* See if the hardware flushed this somehow (can happen in LPARs) */ for (i = 0; i < n_slbs; i++) if (slbcache[i].slbe == (slbe | (uint64_t)i)) return; /* Not in the map, needs to actually be added */ slbv = kernel_va_to_slbv(addr); if (slbcache[USER_SLB_SLOT].slbe == 0) { for (i = 0; i < n_slbs; i++) { if (i == USER_SLB_SLOT) continue; if (!(slbcache[i].slbe & SLBE_VALID)) goto fillkernslb; } if (i == n_slbs) slbcache[USER_SLB_SLOT].slbe = 1; } /* Sacrifice a random SLB entry that is not the user entry */ i = mftb() % n_slbs; if (i == USER_SLB_SLOT) i = (i+1) % n_slbs; fillkernslb: /* Write new entry */ slbcache[i].slbv = slbv; slbcache[i].slbe = slbe | (uint64_t)i; /* Trap handler will restore from cache on exit */ } static int handle_user_slb_spill(pmap_t pm, vm_offset_t addr) { struct slb *user_entry; uint64_t esid; int i; esid = (uintptr_t)addr >> ADDR_SR_SHFT; PMAP_LOCK(pm); user_entry = user_va_to_slb_entry(pm, addr); if (user_entry == NULL) { /* allocate_vsid auto-spills it */ (void)allocate_user_vsid(pm, esid, 0); } else { /* * Check that another CPU has not already mapped this. * XXX: Per-thread SLB caches would be better. */ for (i = 0; i < pm->pm_slb_len; i++) if (pm->pm_slb[i] == user_entry) break; if (i == pm->pm_slb_len) slb_insert_user(pm, user_entry); } PMAP_UNLOCK(pm); return (0); } #endif static int trap_pfault(struct trapframe *frame, int user) { vm_offset_t eva, va; struct thread *td; struct proc *p; vm_map_t map; vm_prot_t ftype; int rv; #ifdef AIM register_t user_sr; #endif td = curthread; p = td->td_proc; if (frame->exc == EXC_ISI) { eva = frame->srr0; ftype = VM_PROT_EXECUTE; if (frame->srr1 & SRR1_ISI_PFAULT) ftype |= VM_PROT_READ; } else { eva = frame->dar; #ifdef BOOKE if (frame->cpu.booke.esr & ESR_ST) #else if (frame->cpu.aim.dsisr & DSISR_STORE) #endif ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; } else { #ifdef BOOKE if (eva < VM_MAXUSER_ADDRESS) { #else if ((eva >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { #endif map = &p->p_vmspace->vm_map; #ifdef AIM user_sr = td->td_pcb->pcb_cpu.aim.usr_segm; eva &= ADDR_PIDX | ADDR_POFF; eva |= user_sr << ADDR_SR_SHFT; #endif } else { map = kernel_map; } } va = trunc_page(eva); /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ if (rv == KERN_SUCCESS) return (0); if (!user && handle_onfault(frame)) return (0); return (SIGSEGV); } /* * For now, this only deals with the particular unaligned access case * that gcc tends to generate. Eventually it should handle all of the * possibilities that can happen on a 32-bit PowerPC in big-endian mode. */ static int fix_unaligned(struct thread *td, struct trapframe *frame) { struct thread *fputhread; #ifdef __SPE__ uint32_t inst; #endif int indicator, reg; double *fpr; #ifdef __SPE__ indicator = (frame->cpu.booke.esr & (ESR_ST|ESR_SPE)); if (indicator & ESR_SPE) { if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_SPE_REG(inst); fpr = (double *)td->td_pcb->pcb_vec.vr[reg]; fputhread = PCPU_GET(vecthread); /* Juggle the SPE to ensure that we've initialized * the registers, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_vec(fputhread); enable_vec(td); } save_vec(td); if (!(indicator & ESR_ST)) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); frame->fixreg[reg] = td->td_pcb->pcb_vec.vr[reg][1]; enable_vec(td); } else { td->td_pcb->pcb_vec.vr[reg][1] = frame->fixreg[reg]; if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); } #else indicator = EXC_ALI_OPCODE_INDICATOR(frame->cpu.aim.dsisr); switch (indicator) { case EXC_ALI_LFD: case EXC_ALI_STFD: reg = EXC_ALI_RST(frame->cpu.aim.dsisr); fpr = &td->td_pcb->pcb_fpu.fpr[reg].fpr; fputhread = PCPU_GET(fputhread); /* Juggle the FPU to ensure that we've initialized * the FPRs, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_fpu(fputhread); enable_fpu(td); } save_fpu(td); if (indicator == EXC_ALI_LFD) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); enable_fpu(td); } else { if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); break; } #endif return (-1); } #ifdef KDB int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC #ifdef AIM || (frame->exc == EXC_PGM && (frame->srr1 & EXC_PGM_TRAP)) #else || (frame->exc == EXC_DEBUG) || (frame->cpu.booke.esr & ESR_PTR) #endif || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; /* Ignore DTrace traps. */ if (*(uint32_t *)frame->srr0 == EXC_DTRACE) return (0); #ifdef AIM if (type == EXC_PGM && (frame->srr1 & EXC_PGM_TRAP)) { #else if (type == EXC_DEBUG || (frame->cpu.booke.esr & ESR_PTR)) { #endif type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } #endif Index: head/sys/sparc64/include/counter.h =================================================================== --- head/sys/sparc64/include/counter.h (revision 317060) +++ head/sys/sparc64/include/counter.h (revision 317061) @@ -1,93 +1,96 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif +extern struct pcpu dummy_pcpu[]; +#define EARLY_COUNTER &dummy_pcpu[0].pc_early_dummy_counter + #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: head/sys/sparc64/include/pcpu.h =================================================================== --- head/sys/sparc64/include/pcpu.h (revision 317060) +++ head/sys/sparc64/include/pcpu.h (revision 317061) @@ -1,101 +1,101 @@ /*- * Copyright (c) 1999 Luoqi Chen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/include/globaldata.h,v 1.27 2001/04/27 * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #include #include #include #include #define ALT_STACK_SIZE 128 struct pmap; /* * Inside the kernel, the globally reserved register g7 is used to * point at the globaldata structure. */ #define PCPU_MD_FIELDS \ struct cacheinfo pc_cache; \ struct intr_request pc_irpool[IR_FREE]; \ struct intr_request *pc_irhead; \ struct intr_request **pc_irtail; \ struct intr_request *pc_irfree; \ struct pmap *pc_pmap; \ vm_offset_t pc_addr; \ vm_offset_t pc_qmap_addr; \ u_long pc_tickref; \ u_long pc_tickadj; \ u_long pc_tickincrement; \ u_int pc_clock; \ u_int pc_impl; \ u_int pc_mid; \ u_int pc_node; \ u_int pc_tlb_ctx; \ u_int pc_tlb_ctx_max; \ u_int pc_tlb_ctx_min; \ - char __pad[397] + char __pad[653] #ifdef _KERNEL extern void *dpcpu0; struct pcb; struct pcpu; register struct pcb *curpcb __asm__(__XSTRING(PCB_REG)); register struct pcpu *pcpup __asm__(__XSTRING(PCPU_REG)); #define get_pcpu() (pcpup) #define PCPU_GET(member) (pcpup->pc_ ## member) static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("ldx [%" __XSTRING(PCPU_REG) "], %0" : "=r" (td)); return (td); } #define curthread (__curthread()) /* * XXX The implementation of this operation should be made atomic * with respect to preemption. */ #define PCPU_ADD(member, value) (pcpup->pc_ ## member += (value)) #define PCPU_INC(member) PCPU_ADD(member, 1) #define PCPU_PTR(member) (&pcpup->pc_ ## member) #define PCPU_SET(member,value) (pcpup->pc_ ## member = (value)) #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: head/sys/sparc64/sparc64/exception.S =================================================================== --- head/sys/sparc64/sparc64/exception.S (revision 317060) +++ head/sys/sparc64/sparc64/exception.S (revision 317061) @@ -1,3069 +1,3065 @@ /*- * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BSDI $Id: locore.s,v 1.36.2.15 1999/08/23 22:34:41 cp Exp $ */ /*- * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "assym.s" #define TSB_ASI 0x0 #define TSB_KERNEL 0x0 #define TSB_KERNEL_MASK 0x0 #define TSB_KERNEL_PHYS 0x0 #define TSB_KERNEL_PHYS_END 0x0 #define TSB_QUAD_LDD 0x0 .register %g2,#ignore .register %g3,#ignore .register %g6,#ignore .register %g7,#ignore /* * Atomically set a bit in a TTE. */ #define TTE_SET_BIT(r1, r2, r3, bit, a, asi) \ add r1, TTE_DATA, r1 ; \ LD(x, a) [r1] asi, r2 ; \ 9: or r2, bit, r3 ; \ CAS(x, a) [r1] asi, r2, r3 ; \ cmp r2, r3 ; \ bne,pn %xcc, 9b ; \ mov r3, r2 #define TTE_SET_REF(r1, r2, r3, a, asi) TTE_SET_BIT(r1, r2, r3, TD_REF, a, asi) #define TTE_SET_W(r1, r2, r3, a, asi) TTE_SET_BIT(r1, r2, r3, TD_W, a, asi) /* * Macros for spilling and filling live windows. * * NOTE: These macros use exactly 16 instructions, and it is assumed that the * handler will not use more than 24 instructions total, to leave room for * resume vectors which occupy the last 8 instructions. */ #define SPILL(storer, base, size, asi) \ storer %l0, [base + (0 * size)] asi ; \ storer %l1, [base + (1 * size)] asi ; \ storer %l2, [base + (2 * size)] asi ; \ storer %l3, [base + (3 * size)] asi ; \ storer %l4, [base + (4 * size)] asi ; \ storer %l5, [base + (5 * size)] asi ; \ storer %l6, [base + (6 * size)] asi ; \ storer %l7, [base + (7 * size)] asi ; \ storer %i0, [base + (8 * size)] asi ; \ storer %i1, [base + (9 * size)] asi ; \ storer %i2, [base + (10 * size)] asi ; \ storer %i3, [base + (11 * size)] asi ; \ storer %i4, [base + (12 * size)] asi ; \ storer %i5, [base + (13 * size)] asi ; \ storer %i6, [base + (14 * size)] asi ; \ storer %i7, [base + (15 * size)] asi #define FILL(loader, base, size, asi) \ loader [base + (0 * size)] asi, %l0 ; \ loader [base + (1 * size)] asi, %l1 ; \ loader [base + (2 * size)] asi, %l2 ; \ loader [base + (3 * size)] asi, %l3 ; \ loader [base + (4 * size)] asi, %l4 ; \ loader [base + (5 * size)] asi, %l5 ; \ loader [base + (6 * size)] asi, %l6 ; \ loader [base + (7 * size)] asi, %l7 ; \ loader [base + (8 * size)] asi, %i0 ; \ loader [base + (9 * size)] asi, %i1 ; \ loader [base + (10 * size)] asi, %i2 ; \ loader [base + (11 * size)] asi, %i3 ; \ loader [base + (12 * size)] asi, %i4 ; \ loader [base + (13 * size)] asi, %i5 ; \ loader [base + (14 * size)] asi, %i6 ; \ loader [base + (15 * size)] asi, %i7 #define ERRATUM50(reg) mov reg, reg #define KSTACK_SLOP 1024 /* * Sanity check the kernel stack and bail out if it's wrong. * XXX: doesn't handle being on the panic stack. */ #define KSTACK_CHECK \ dec 16, ASP_REG ; \ stx %g1, [ASP_REG + 0] ; \ stx %g2, [ASP_REG + 8] ; \ add %sp, SPOFF, %g1 ; \ andcc %g1, (1 << PTR_SHIFT) - 1, %g0 ; \ bnz,a %xcc, tl1_kstack_fault ; \ inc 16, ASP_REG ; \ ldx [PCPU(CURTHREAD)], %g2 ; \ ldx [%g2 + TD_KSTACK], %g2 ; \ add %g2, KSTACK_SLOP, %g2 ; \ subcc %g1, %g2, %g1 ; \ ble,a %xcc, tl1_kstack_fault ; \ inc 16, ASP_REG ; \ set KSTACK_PAGES * PAGE_SIZE, %g2 ; \ cmp %g1, %g2 ; \ bgt,a %xcc, tl1_kstack_fault ; \ inc 16, ASP_REG ; \ ldx [ASP_REG + 8], %g2 ; \ ldx [ASP_REG + 0], %g1 ; \ inc 16, ASP_REG .globl tl_text_begin tl_text_begin: nop ENTRY(tl1_kstack_fault) rdpr %tl, %g1 1: cmp %g1, 2 be,a 2f nop #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl1_kstack_fault: tl=%#lx tpc=%#lx tnpc=%#lx" , %g2, %g3, %g4, 7, 8, 9) rdpr %tl, %g3 stx %g3, [%g2 + KTR_PARM1] rdpr %tpc, %g3 stx %g3, [%g2 + KTR_PARM1] rdpr %tnpc, %g3 stx %g3, [%g2 + KTR_PARM1] 9: #endif sub %g1, 1, %g1 wrpr %g1, 0, %tl ba,a %xcc, 1b nop 2: #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl1_kstack_fault: sp=%#lx ks=%#lx cr=%#lx cs=%#lx ow=%#lx ws=%#lx" , %g1, %g2, %g3, 7, 8, 9) add %sp, SPOFF, %g2 stx %g2, [%g1 + KTR_PARM1] ldx [PCPU(CURTHREAD)], %g2 ldx [%g2 + TD_KSTACK], %g2 stx %g2, [%g1 + KTR_PARM2] rdpr %canrestore, %g2 stx %g2, [%g1 + KTR_PARM3] rdpr %cansave, %g2 stx %g2, [%g1 + KTR_PARM4] rdpr %otherwin, %g2 stx %g2, [%g1 + KTR_PARM5] rdpr %wstate, %g2 stx %g2, [%g1 + KTR_PARM6] 9: #endif wrpr %g0, 0, %canrestore wrpr %g0, 6, %cansave wrpr %g0, 0, %otherwin wrpr %g0, WSTATE_KERNEL, %wstate sub ASP_REG, SPOFF + CCFSZ, %sp clr %fp set trap, %o2 ba %xcc, tl1_trap mov T_KSTACK_FAULT | T_KERNEL, %o0 END(tl1_kstack_fault) /* * Magic to resume from a spill or fill trap. If we get an alignment or an * MMU fault during a spill or a fill, this macro will detect the fault and * resume at a set instruction offset in the trap handler. * * To check if the previous trap was a spill/fill we convert the trapped pc * to a trap type and verify that it is in the range of spill/fill vectors. * The spill/fill vectors are types 0x80-0xff and 0x280-0x2ff, masking off the * tl bit allows us to detect both ranges with one test. * * This is: * 0x80 <= (((%tpc - %tba) >> 5) & ~0x200) < 0x100 * * To calculate the new pc we take advantage of the xor feature of wrpr. * Forcing all the low bits of the trapped pc on we can produce any offset * into the spill/fill vector. The size of a spill/fill trap vector is 0x80. * * 0x7f ^ 0x1f == 0x60 * 0x1f == (0x80 - 0x60) - 1 * * Which are the offset and xor value used to resume from alignment faults. */ /* * Determine if we have trapped inside of a spill/fill vector, and if so resume * at a fixed instruction offset in the trap vector. Must be called on * alternate globals. */ #define RESUME_SPILLFILL_MAGIC(stxa_g0_sfsr, xor) \ dec 16, ASP_REG ; \ stx %g1, [ASP_REG + 0] ; \ stx %g2, [ASP_REG + 8] ; \ rdpr %tpc, %g1 ; \ ERRATUM50(%g1) ; \ rdpr %tba, %g2 ; \ sub %g1, %g2, %g2 ; \ srlx %g2, 5, %g2 ; \ andn %g2, 0x200, %g2 ; \ cmp %g2, 0x80 ; \ blu,pt %xcc, 9f ; \ cmp %g2, 0x100 ; \ bgeu,pt %xcc, 9f ; \ or %g1, 0x7f, %g1 ; \ wrpr %g1, xor, %tnpc ; \ stxa_g0_sfsr ; \ ldx [ASP_REG + 8], %g2 ; \ ldx [ASP_REG + 0], %g1 ; \ inc 16, ASP_REG ; \ done ; \ 9: ldx [ASP_REG + 8], %g2 ; \ ldx [ASP_REG + 0], %g1 ; \ inc 16, ASP_REG /* * For certain faults we need to clear the SFSR MMU register before returning. */ #define RSF_CLR_SFSR \ wr %g0, ASI_DMMU, %asi ; \ stxa %g0, [%g0 + AA_DMMU_SFSR] %asi #define RSF_XOR(off) ((0x80 - off) - 1) /* * Instruction offsets in spill and fill trap handlers for handling certain * nested traps, and corresponding xor constants for wrpr. */ #define RSF_OFF_ALIGN 0x60 #define RSF_OFF_MMU 0x70 #define RESUME_SPILLFILL_ALIGN \ RESUME_SPILLFILL_MAGIC(RSF_CLR_SFSR, RSF_XOR(RSF_OFF_ALIGN)) #define RESUME_SPILLFILL_MMU \ RESUME_SPILLFILL_MAGIC(EMPTY, RSF_XOR(RSF_OFF_MMU)) #define RESUME_SPILLFILL_MMU_CLR_SFSR \ RESUME_SPILLFILL_MAGIC(RSF_CLR_SFSR, RSF_XOR(RSF_OFF_MMU)) /* * Constant to add to %tnpc when taking a fill trap just before returning to * user mode. */ #define RSF_FILL_INC tl0_ret_fill_end - tl0_ret_fill /* * Generate a T_SPILL or T_FILL trap if the window operation fails. */ #define RSF_TRAP(type) \ ba %xcc, tl0_sftrap ; \ mov type, %g2 ; \ .align 16 /* * Game over if the window operation fails. */ #define RSF_FATAL(type) \ ba %xcc, rsf_fatal ; \ mov type, %g2 ; \ .align 16 /* * Magic to resume from a failed fill a few instructions after the corrsponding * restore. This is used on return from the kernel to usermode. */ #define RSF_FILL_MAGIC \ rdpr %tnpc, %g1 ; \ add %g1, RSF_FILL_INC, %g1 ; \ wrpr %g1, 0, %tnpc ; \ done ; \ .align 16 /* * Spill to the pcb if a spill to the user stack in kernel mode fails. */ #define RSF_SPILL_TOPCB \ ba,a %xcc, tl1_spill_topcb ; \ nop ; \ .align 16 ENTRY(rsf_fatal) #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "rsf_fatal: bad window trap tt=%#lx type=%#lx" , %g1, %g3, %g4, 7, 8, 9) rdpr %tt, %g3 stx %g3, [%g1 + KTR_PARM1] stx %g2, [%g1 + KTR_PARM2] 9: #endif KSTACK_CHECK sir END(rsf_fatal) .data _ALIGN_DATA .globl intrnames, sintrnames intrnames: .space (IV_MAX + PIL_MAX) * (MAXCOMLEN + 1) sintrnames: .quad (IV_MAX + PIL_MAX) * (MAXCOMLEN + 1) .globl intrcnt, sintrcnt intrcnt: .space (IV_MAX + PIL_MAX) * 8 sintrcnt: .quad (IV_MAX + PIL_MAX) * 8 .text /* * Trap table and associated macros * * Due to its size a trap table is an inherently hard thing to represent in * code in a clean way. There are approximately 1024 vectors, of 8 or 32 * instructions each, many of which are identical. The way that this is * laid out is the instructions (8 or 32) for the actual trap vector appear * as an AS macro. In general this code branches to tl0_trap or tl1_trap, * but if not supporting code can be placed just after the definition of the * macro. The macros are then instantiated in a different section (.trap), * which is setup to be placed by the linker at the beginning of .text, and the * code around the macros is moved to the end of trap table. In this way the * code that must be sequential in memory can be split up, and located near * its supporting code so that it is easier to follow. */ /* * Clean window traps occur when %cleanwin is zero to ensure that data * is not leaked between address spaces in registers. */ .macro clean_window clr %o0 clr %o1 clr %o2 clr %o3 clr %o4 clr %o5 clr %o6 clr %o7 clr %l0 clr %l1 clr %l2 clr %l3 clr %l4 clr %l5 clr %l6 rdpr %cleanwin, %l7 inc %l7 wrpr %l7, 0, %cleanwin clr %l7 retry .align 128 .endm /* * Stack fixups for entry from user mode. We are still running on the * user stack, and with its live registers, so we must save soon. We * are on alternate globals so we do have some registers. Set the * transitional window state, and do the save. If this traps we * attempt to spill a window to the user stack. If this fails, we * spill the window to the pcb and continue. Spilling to the pcb * must not fail. * * NOTE: Must be called with alternate globals and clobbers %g1. */ .macro tl0_split rdpr %wstate, %g1 wrpr %g1, WSTATE_TRANSITION, %wstate save .endm .macro tl0_setup type tl0_split clr %o1 set trap, %o2 ba %xcc, tl0_utrap mov \type, %o0 .endm /* * Generic trap type. Call trap() with the specified type. */ .macro tl0_gen type tl0_setup \type .align 32 .endm /* * This is used to suck up the massive swaths of reserved trap types. * Generates count "reserved" trap vectors. */ .macro tl0_reserved count .rept \count tl0_gen T_RESERVED .endr .endm .macro tl1_split rdpr %wstate, %g1 wrpr %g1, WSTATE_NESTED, %wstate save %sp, -(CCFSZ + TF_SIZEOF), %sp .endm .macro tl1_setup type tl1_split clr %o1 set trap, %o2 ba %xcc, tl1_trap mov \type | T_KERNEL, %o0 .endm .macro tl1_gen type tl1_setup \type .align 32 .endm .macro tl1_reserved count .rept \count tl1_gen T_RESERVED .endr .endm .macro tl0_insn_excptn wrpr %g0, PSTATE_ALT, %pstate wr %g0, ASI_IMMU, %asi rdpr %tpc, %g3 ldxa [%g0 + AA_IMMU_SFSR] %asi, %g4 /* * XXX in theory, a store to AA_IMMU_SFSR must be immediately * followed by a DONE, FLUSH or RETRY for USIII. In practice, * this triggers a RED state exception though. */ stxa %g0, [%g0 + AA_IMMU_SFSR] %asi membar #Sync ba %xcc, tl0_sfsr_trap mov T_INSTRUCTION_EXCEPTION, %g2 .align 32 .endm .macro tl0_data_excptn wrpr %g0, PSTATE_ALT, %pstate wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync ba %xcc, tl0_sfsr_trap mov T_DATA_EXCEPTION, %g2 .align 32 .endm .macro tl0_align wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync ba %xcc, tl0_sfsr_trap mov T_MEM_ADDRESS_NOT_ALIGNED, %g2 .align 32 .endm ENTRY(tl0_sfsr_trap) tl0_split clr %o1 set trap, %o2 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl0_utrap mov %g2, %o0 END(tl0_sfsr_trap) .macro tl0_intr level, mask tl0_split set \mask, %o1 ba %xcc, tl0_intr mov \level, %o0 .align 32 .endm #define INTR(level, traplvl) \ tl ## traplvl ## _intr level, 1 << level #define TICK(traplvl) \ tl ## traplvl ## _intr PIL_TICK, 0x10001 #define INTR_LEVEL(tl) \ INTR(1, tl) ; \ INTR(2, tl) ; \ INTR(3, tl) ; \ INTR(4, tl) ; \ INTR(5, tl) ; \ INTR(6, tl) ; \ INTR(7, tl) ; \ INTR(8, tl) ; \ INTR(9, tl) ; \ INTR(10, tl) ; \ INTR(11, tl) ; \ INTR(12, tl) ; \ INTR(13, tl) ; \ TICK(tl) ; \ INTR(15, tl) ; .macro tl0_intr_level INTR_LEVEL(0) .endm .macro intr_vector ldxa [%g0] ASI_INTR_RECEIVE, %g1 andcc %g1, IRSR_BUSY, %g0 bnz,a,pt %xcc, intr_vector nop ba,a,pt %xcc, intr_vector_stray nop .align 32 .endm .macro tl0_immu_miss /* * Load the context and the virtual page number from the tag access * register. We ignore the context. */ wr %g0, ASI_IMMU, %asi ldxa [%g0 + AA_IMMU_TAR] %asi, %g1 /* * Initialize the page size walker. */ mov TS_MIN, %g2 /* * Loop over all supported page sizes. */ /* * Compute the page shift for the page size we are currently looking * for. */ 1: add %g2, %g2, %g3 add %g3, %g2, %g3 add %g3, PAGE_SHIFT, %g3 /* * Extract the virtual page number from the contents of the tag * access register. */ srlx %g1, %g3, %g3 /* * Compute the TTE bucket address. */ ldxa [%g0 + AA_IMMU_TSB] %asi, %g5 and %g3, TSB_BUCKET_MASK, %g4 sllx %g4, TSB_BUCKET_SHIFT + TTE_SHIFT, %g4 add %g4, %g5, %g4 /* * Compute the TTE tag target. */ sllx %g3, TV_SIZE_BITS, %g3 or %g3, %g2, %g3 /* * Loop over the TTEs in this bucket. */ /* * Load the TTE. Note that this instruction may fault, clobbering * the contents of the tag access register, %g5, %g6, and %g7. We * do not use %g5, and %g6 and %g7 are not used until this instruction * completes successfully. */ 2: ldda [%g4] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and executable and that the TTE tags match. */ brgez,pn %g7, 3f andcc %g7, TD_EXEC, %g0 bz,pn %xcc, 3f cmp %g3, %g6 bne,pn %xcc, 3f EMPTY /* * We matched a TTE, load the TLB. */ /* * Set the reference bit, if it's currently clear. */ andcc %g7, TD_REF, %g0 bz,a,pn %xcc, tl0_immu_miss_set_ref nop /* * Load the TTE tag and data into the TLB and retry the instruction. */ stxa %g1, [%g0 + AA_IMMU_TAR] %asi stxa %g7, [%g0] ASI_ITLB_DATA_IN_REG retry /* * Advance to the next TTE in this bucket, and check the low bits * of the bucket pointer to see if we've finished the bucket. */ 3: add %g4, 1 << TTE_SHIFT, %g4 andcc %g4, (1 << (TSB_BUCKET_SHIFT + TTE_SHIFT)) - 1, %g0 bnz,pt %xcc, 2b EMPTY /* * See if we just checked the largest page size, and advance to the * next one if not. */ cmp %g2, TS_MAX bne,pt %xcc, 1b add %g2, 1, %g2 /* * Not in user TSB, call C code. */ ba,a %xcc, tl0_immu_miss_trap .align 128 .endm ENTRY(tl0_immu_miss_set_ref) /* * Set the reference bit. */ TTE_SET_REF(%g4, %g2, %g3, a, ASI_N) /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g2, 1f nop /* * Load the TTE tag and data into the TLB and retry the instruction. */ stxa %g1, [%g0 + AA_IMMU_TAR] %asi stxa %g2, [%g0] ASI_ITLB_DATA_IN_REG 1: retry END(tl0_immu_miss_set_ref) ENTRY(tl0_immu_miss_trap) /* * Put back the contents of the tag access register, in case we * faulted. */ sethi %hi(KERNBASE), %g2 stxa %g1, [%g0 + AA_IMMU_TAR] %asi flush %g2 /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate /* * Reload the tag access register. */ ldxa [%g0 + AA_IMMU_TAR] %asi, %g2 /* * Save the tag access register, and call common trap code. */ tl0_split clr %o1 set trap, %o2 mov %g2, %o3 ba %xcc, tl0_utrap mov T_INSTRUCTION_MISS, %o0 END(tl0_immu_miss_trap) .macro tl0_dmmu_miss /* * Load the context and the virtual page number from the tag access * register. We ignore the context. */ wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_TAR] %asi, %g1 /* * Initialize the page size walker. */ tl1_dmmu_miss_user: mov TS_MIN, %g2 /* * Loop over all supported page sizes. */ /* * Compute the page shift for the page size we are currently looking * for. */ 1: add %g2, %g2, %g3 add %g3, %g2, %g3 add %g3, PAGE_SHIFT, %g3 /* * Extract the virtual page number from the contents of the tag * access register. */ srlx %g1, %g3, %g3 /* * Compute the TTE bucket address. */ ldxa [%g0 + AA_DMMU_TSB] %asi, %g5 and %g3, TSB_BUCKET_MASK, %g4 sllx %g4, TSB_BUCKET_SHIFT + TTE_SHIFT, %g4 add %g4, %g5, %g4 /* * Compute the TTE tag target. */ sllx %g3, TV_SIZE_BITS, %g3 or %g3, %g2, %g3 /* * Loop over the TTEs in this bucket. */ /* * Load the TTE. Note that this instruction may fault, clobbering * the contents of the tag access register, %g5, %g6, and %g7. We * do not use %g5, and %g6 and %g7 are not used until this instruction * completes successfully. */ 2: ldda [%g4] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and that the virtual page numbers match. */ brgez,pn %g7, 3f cmp %g3, %g6 bne,pn %xcc, 3f EMPTY /* * We matched a TTE, load the TLB. */ /* * Set the reference bit, if it's currently clear. */ andcc %g7, TD_REF, %g0 bz,a,pn %xcc, tl0_dmmu_miss_set_ref nop /* * Load the TTE tag and data into the TLB and retry the instruction. */ stxa %g1, [%g0 + AA_DMMU_TAR] %asi stxa %g7, [%g0] ASI_DTLB_DATA_IN_REG retry /* * Advance to the next TTE in this bucket, and check the low bits * of the bucket pointer to see if we've finished the bucket. */ 3: add %g4, 1 << TTE_SHIFT, %g4 andcc %g4, (1 << (TSB_BUCKET_SHIFT + TTE_SHIFT)) - 1, %g0 bnz,pt %xcc, 2b EMPTY /* * See if we just checked the largest page size, and advance to the * next one if not. */ cmp %g2, TS_MAX bne,pt %xcc, 1b add %g2, 1, %g2 /* * Not in user TSB, call C code. */ ba,a %xcc, tl0_dmmu_miss_trap .align 128 .endm ENTRY(tl0_dmmu_miss_set_ref) /* * Set the reference bit. */ TTE_SET_REF(%g4, %g2, %g3, a, ASI_N) /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g2, 1f nop /* * Load the TTE tag and data into the TLB and retry the instruction. */ stxa %g1, [%g0 + AA_DMMU_TAR] %asi stxa %g2, [%g0] ASI_DTLB_DATA_IN_REG 1: retry END(tl0_dmmu_miss_set_ref) ENTRY(tl0_dmmu_miss_trap) /* * Put back the contents of the tag access register, in case we * faulted. */ stxa %g1, [%g0 + AA_DMMU_TAR] %asi membar #Sync /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate /* * Check if we actually came from the kernel. */ rdpr %tl, %g1 cmp %g1, 1 bgt,a,pn %xcc, 1f nop /* * Reload the tag access register. */ ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 /* * Save the tag access register and call common trap code. */ tl0_split clr %o1 set trap, %o2 mov %g2, %o3 ba %xcc, tl0_utrap mov T_DATA_MISS, %o0 /* * Handle faults during window spill/fill. */ 1: RESUME_SPILLFILL_MMU /* * Reload the tag access register. */ ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 tl1_split clr %o1 set trap, %o2 mov %g2, %o3 ba %xcc, tl1_trap mov T_DATA_MISS | T_KERNEL, %o0 END(tl0_dmmu_miss_trap) .macro tl0_dmmu_prot ba,a %xcc, tl0_dmmu_prot_1 nop .align 128 .endm ENTRY(tl0_dmmu_prot_1) /* * Load the context and the virtual page number from the tag access * register. We ignore the context. */ wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_TAR] %asi, %g1 /* * Initialize the page size walker. */ tl1_dmmu_prot_user: mov TS_MIN, %g2 /* * Loop over all supported page sizes. */ /* * Compute the page shift for the page size we are currently looking * for. */ 1: add %g2, %g2, %g3 add %g3, %g2, %g3 add %g3, PAGE_SHIFT, %g3 /* * Extract the virtual page number from the contents of the tag * access register. */ srlx %g1, %g3, %g3 /* * Compute the TTE bucket address. */ ldxa [%g0 + AA_DMMU_TSB] %asi, %g5 and %g3, TSB_BUCKET_MASK, %g4 sllx %g4, TSB_BUCKET_SHIFT + TTE_SHIFT, %g4 add %g4, %g5, %g4 /* * Compute the TTE tag target. */ sllx %g3, TV_SIZE_BITS, %g3 or %g3, %g2, %g3 /* * Loop over the TTEs in this bucket. */ /* * Load the TTE. Note that this instruction may fault, clobbering * the contents of the tag access register, %g5, %g6, and %g7. We * do not use %g5, and %g6 and %g7 are not used until this instruction * completes successfully. */ 2: ldda [%g4] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and writable and that the virtual page * numbers match. */ brgez,pn %g7, 4f andcc %g7, TD_SW, %g0 bz,pn %xcc, 4f cmp %g3, %g6 bne,pn %xcc, 4f nop /* * Set the hardware write bit. */ TTE_SET_W(%g4, %g2, %g3, a, ASI_N) /* * Delete the old TLB entry and clear the SFSR. */ srlx %g1, PAGE_SHIFT, %g3 sllx %g3, PAGE_SHIFT, %g3 stxa %g0, [%g3] ASI_DMMU_DEMAP stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g2, 3f or %g2, TD_W, %g2 /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g1, [%g0 + AA_DMMU_TAR] %asi stxa %g2, [%g0] ASI_DTLB_DATA_IN_REG 3: retry /* * Check the low bits to see if we've finished the bucket. */ 4: add %g4, 1 << TTE_SHIFT, %g4 andcc %g4, (1 << (TSB_BUCKET_SHIFT + TTE_SHIFT)) - 1, %g0 bnz,pt %xcc, 2b EMPTY /* * See if we just checked the largest page size, and advance to the * next one if not. */ cmp %g2, TS_MAX bne,pt %xcc, 1b add %g2, 1, %g2 /* * Not in user TSB, call C code. */ ba,a %xcc, tl0_dmmu_prot_trap nop END(tl0_dmmu_prot_1) ENTRY(tl0_dmmu_prot_trap) /* * Put back the contents of the tag access register, in case we * faulted. */ stxa %g1, [%g0 + AA_DMMU_TAR] %asi membar #Sync /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate /* * Check if we actually came from the kernel. */ rdpr %tl, %g1 cmp %g1, 1 bgt,a,pn %xcc, 1f nop /* * Load the SFAR, SFSR and TAR. */ ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync /* * Save the MMU registers and call common trap code. */ tl0_split clr %o1 set trap, %o2 mov %g2, %o3 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl0_utrap mov T_DATA_PROTECTION, %o0 /* * Handle faults during window spill/fill. */ 1: RESUME_SPILLFILL_MMU_CLR_SFSR /* * Load the SFAR, SFSR and TAR. Clear the SFSR. */ ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync tl1_split clr %o1 set trap, %o2 mov %g2, %o3 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl1_trap mov T_DATA_PROTECTION | T_KERNEL, %o0 END(tl0_dmmu_prot_trap) .macro tl0_spill_0_n wr %g0, ASI_AIUP, %asi SPILL(stxa, %sp + SPOFF, 8, %asi) saved retry .align 32 RSF_TRAP(T_SPILL) RSF_TRAP(T_SPILL) .endm .macro tl0_spill_1_n wr %g0, ASI_AIUP, %asi SPILL(stwa, %sp, 4, %asi) saved retry .align 32 RSF_TRAP(T_SPILL) RSF_TRAP(T_SPILL) .endm .macro tl0_fill_0_n wr %g0, ASI_AIUP, %asi FILL(ldxa, %sp + SPOFF, 8, %asi) restored retry .align 32 RSF_TRAP(T_FILL) RSF_TRAP(T_FILL) .endm .macro tl0_fill_1_n wr %g0, ASI_AIUP, %asi FILL(lduwa, %sp, 4, %asi) restored retry .align 32 RSF_TRAP(T_FILL) RSF_TRAP(T_FILL) .endm ENTRY(tl0_sftrap) rdpr %tstate, %g1 and %g1, TSTATE_CWP_MASK, %g1 wrpr %g1, 0, %cwp tl0_split clr %o1 set trap, %o2 ba %xcc, tl0_trap mov %g2, %o0 END(tl0_sftrap) .macro tl0_spill_bad count .rept \count sir .align 128 .endr .endm .macro tl0_fill_bad count .rept \count sir .align 128 .endr .endm .macro tl0_syscall tl0_split clr %o1 set syscall, %o2 ba %xcc, tl0_trap mov T_SYSCALL, %o0 .align 32 .endm .macro tl0_fp_restore ba,a %xcc, tl0_fp_restore nop .align 32 .endm ENTRY(tl0_fp_restore) ldx [PCB_REG + PCB_FLAGS], %g1 andn %g1, PCB_FEF, %g1 stx %g1, [PCB_REG + PCB_FLAGS] wr %g0, FPRS_FEF, %fprs wr %g0, ASI_BLK_S, %asi ldda [PCB_REG + PCB_UFP + (0 * 64)] %asi, %f0 ldda [PCB_REG + PCB_UFP + (1 * 64)] %asi, %f16 ldda [PCB_REG + PCB_UFP + (2 * 64)] %asi, %f32 ldda [PCB_REG + PCB_UFP + (3 * 64)] %asi, %f48 membar #Sync done END(tl0_fp_restore) .macro tl1_insn_excptn wrpr %g0, PSTATE_ALT, %pstate wr %g0, ASI_IMMU, %asi rdpr %tpc, %g3 ldxa [%g0 + AA_IMMU_SFSR] %asi, %g4 /* * XXX in theory, a store to AA_IMMU_SFSR must be immediately * followed by a DONE, FLUSH or RETRY for USIII. In practice, * this triggers a RED state exception though. */ stxa %g0, [%g0 + AA_IMMU_SFSR] %asi membar #Sync ba %xcc, tl1_insn_exceptn_trap mov T_INSTRUCTION_EXCEPTION | T_KERNEL, %g2 .align 32 .endm ENTRY(tl1_insn_exceptn_trap) tl1_split clr %o1 set trap, %o2 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl1_trap mov %g2, %o0 END(tl1_insn_exceptn_trap) .macro tl1_fp_disabled ba,a %xcc, tl1_fp_disabled_1 nop .align 32 .endm ENTRY(tl1_fp_disabled_1) rdpr %tpc, %g1 set fpu_fault_begin, %g2 sub %g1, %g2, %g1 cmp %g1, fpu_fault_size bgeu,a,pn %xcc, 1f nop wr %g0, FPRS_FEF, %fprs wr %g0, ASI_BLK_S, %asi ldda [PCB_REG + PCB_KFP + (0 * 64)] %asi, %f0 ldda [PCB_REG + PCB_KFP + (1 * 64)] %asi, %f16 ldda [PCB_REG + PCB_KFP + (2 * 64)] %asi, %f32 ldda [PCB_REG + PCB_KFP + (3 * 64)] %asi, %f48 membar #Sync retry 1: tl1_split clr %o1 set trap, %o2 ba %xcc, tl1_trap mov T_FP_DISABLED | T_KERNEL, %o0 END(tl1_fp_disabled_1) .macro tl1_data_excptn wrpr %g0, PSTATE_ALT, %pstate ba,a %xcc, tl1_data_excptn_trap nop .align 32 .endm ENTRY(tl1_data_excptn_trap) RESUME_SPILLFILL_MMU_CLR_SFSR ba %xcc, tl1_sfsr_trap mov T_DATA_EXCEPTION | T_KERNEL, %g2 END(tl1_data_excptn_trap) .macro tl1_align wrpr %g0, PSTATE_ALT, %pstate ba,a %xcc, tl1_align_trap nop .align 32 .endm ENTRY(tl1_align_trap) RESUME_SPILLFILL_ALIGN ba %xcc, tl1_sfsr_trap mov T_MEM_ADDRESS_NOT_ALIGNED | T_KERNEL, %g2 END(tl1_align_trap) ENTRY(tl1_sfsr_trap) wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync tl1_split clr %o1 set trap, %o2 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl1_trap mov %g2, %o0 END(tl1_sfsr_trap) .macro tl1_intr level, mask tl1_split set \mask, %o1 ba %xcc, tl1_intr mov \level, %o0 .align 32 .endm .macro tl1_intr_level INTR_LEVEL(1) .endm .macro tl1_immu_miss /* * Load the context and the virtual page number from the tag access * register. We ignore the context. */ wr %g0, ASI_IMMU, %asi ldxa [%g0 + AA_IMMU_TAR] %asi, %g5 /* * Compute the address of the TTE. The TSB mask and address of the * TSB are patched at startup. */ .globl tl1_immu_miss_patch_tsb_1 tl1_immu_miss_patch_tsb_1: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_immu_miss_patch_tsb_mask_1 tl1_immu_miss_patch_tsb_mask_1: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 srlx %g5, TAR_VPN_SHIFT, %g5 and %g5, %g6, %g6 sllx %g6, TTE_SHIFT, %g6 add %g6, %g7, %g6 /* * Load the TTE. */ .globl tl1_immu_miss_patch_quad_ldd_1 tl1_immu_miss_patch_quad_ldd_1: ldda [%g6] TSB_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and executable and that the virtual page * numbers match. */ brgez,pn %g7, tl1_immu_miss_trap andcc %g7, TD_EXEC, %g0 bz,pn %xcc, tl1_immu_miss_trap srlx %g6, TV_SIZE_BITS, %g6 cmp %g5, %g6 bne,pn %xcc, tl1_immu_miss_trap EMPTY /* * Set the reference bit if it's currently clear. */ andcc %g7, TD_REF, %g0 bz,a,pn %xcc, tl1_immu_miss_set_ref nop /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g7, [%g0] ASI_ITLB_DATA_IN_REG retry .align 128 .endm ENTRY(tl1_immu_miss_set_ref) /* * Recompute the TTE address, which we clobbered loading the TTE. * The TSB mask and address of the TSB are patched at startup. */ .globl tl1_immu_miss_patch_tsb_2 tl1_immu_miss_patch_tsb_2: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_immu_miss_patch_tsb_mask_2 tl1_immu_miss_patch_tsb_mask_2: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 and %g5, %g6, %g5 sllx %g5, TTE_SHIFT, %g5 add %g5, %g7, %g5 /* * Set the reference bit. */ .globl tl1_immu_miss_patch_asi_1 tl1_immu_miss_patch_asi_1: wr %g0, TSB_ASI, %asi TTE_SET_REF(%g5, %g6, %g7, a, %asi) /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g6, 1f nop /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g6, [%g0] ASI_ITLB_DATA_IN_REG 1: retry END(tl1_immu_miss_set_ref) ENTRY(tl1_immu_miss_trap) /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate ldxa [%g0 + AA_IMMU_TAR] %asi, %g2 tl1_split clr %o1 set trap, %o2 mov %g2, %o3 ba %xcc, tl1_trap mov T_INSTRUCTION_MISS | T_KERNEL, %o0 END(tl1_immu_miss_trap) .macro tl1_dmmu_miss /* * Load the context and the virtual page number from the tag access * register. */ wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_TAR] %asi, %g5 /* * Extract the context from the contents of the tag access register. * If it's non-zero this is a fault on a user address. Note that the * faulting address is passed in %g1. */ sllx %g5, 64 - TAR_VPN_SHIFT, %g6 brnz,a,pn %g6, tl1_dmmu_miss_user mov %g5, %g1 /* * Check for the direct mapped physical region. These addresses have * the high bit set so they are negative. */ brlz,pn %g5, tl1_dmmu_miss_direct EMPTY /* * Compute the address of the TTE. The TSB mask and address of the * TSB are patched at startup. */ .globl tl1_dmmu_miss_patch_tsb_1 tl1_dmmu_miss_patch_tsb_1: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_dmmu_miss_patch_tsb_mask_1 tl1_dmmu_miss_patch_tsb_mask_1: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 srlx %g5, TAR_VPN_SHIFT, %g5 and %g5, %g6, %g6 sllx %g6, TTE_SHIFT, %g6 add %g6, %g7, %g6 /* * Load the TTE. */ .globl tl1_dmmu_miss_patch_quad_ldd_1 tl1_dmmu_miss_patch_quad_ldd_1: ldda [%g6] TSB_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and that the virtual page numbers match. */ brgez,pn %g7, tl1_dmmu_miss_trap srlx %g6, TV_SIZE_BITS, %g6 cmp %g5, %g6 bne,pn %xcc, tl1_dmmu_miss_trap EMPTY /* * Set the reference bit if it's currently clear. */ andcc %g7, TD_REF, %g0 bz,a,pt %xcc, tl1_dmmu_miss_set_ref nop /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g7, [%g0] ASI_DTLB_DATA_IN_REG retry .align 128 .endm ENTRY(tl1_dmmu_miss_set_ref) /* * Recompute the TTE address, which we clobbered loading the TTE. * The TSB mask and address of the TSB are patched at startup. */ .globl tl1_dmmu_miss_patch_tsb_mask_2 tl1_dmmu_miss_patch_tsb_2: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_dmmu_miss_patch_tsb_2 tl1_dmmu_miss_patch_tsb_mask_2: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 and %g5, %g6, %g5 sllx %g5, TTE_SHIFT, %g5 add %g5, %g7, %g5 /* * Set the reference bit. */ .globl tl1_dmmu_miss_patch_asi_1 tl1_dmmu_miss_patch_asi_1: wr %g0, TSB_ASI, %asi TTE_SET_REF(%g5, %g6, %g7, a, %asi) /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g6, 1f nop /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g6, [%g0] ASI_DTLB_DATA_IN_REG 1: retry END(tl1_dmmu_miss_set_ref) ENTRY(tl1_dmmu_miss_trap) /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 KSTACK_CHECK tl1_split clr %o1 set trap, %o2 mov %g2, %o3 ba %xcc, tl1_trap mov T_DATA_MISS | T_KERNEL, %o0 END(tl1_dmmu_miss_trap) ENTRY(tl1_dmmu_miss_direct) /* * Mask off the high bits of the virtual address to get the physical * address, and or in the TTE bits. The virtual address bits that * correspond to the TTE valid and page size bits are left set, so * they don't have to be included in the TTE bits below. We know they * are set because the virtual address is in the upper va hole. * NB: if we are taking advantage of the ASI_ATOMIC_QUAD_LDD_PHYS * and we get a miss on the directly accessed kernel TSB we must not * set TD_CV in order to access it uniformly bypassing the D$. */ setx TLB_DIRECT_ADDRESS_MASK, %g7, %g4 and %g5, %g4, %g4 setx TLB_DIRECT_TO_TTE_MASK, %g7, %g6 and %g5, %g6, %g5 .globl tl1_dmmu_miss_direct_patch_tsb_phys_1 tl1_dmmu_miss_direct_patch_tsb_phys_1: sethi %uhi(TSB_KERNEL_PHYS), %g3 or %g3, %ulo(TSB_KERNEL_PHYS), %g3 sllx %g3, 32, %g3 sethi %hi(TSB_KERNEL_PHYS), %g3 or %g7, %g3, %g7 cmp %g4, %g7 bl,pt %xcc, 1f or %g5, TD_CP | TD_W, %g5 .globl tl1_dmmu_miss_direct_patch_tsb_phys_end_1 tl1_dmmu_miss_direct_patch_tsb_phys_end_1: sethi %uhi(TSB_KERNEL_PHYS_END), %g3 or %g3, %ulo(TSB_KERNEL_PHYS_END), %g3 sllx %g3, 32, %g3 sethi %hi(TSB_KERNEL_PHYS_END), %g7 or %g7, %g3, %g7 cmp %g4, %g7 bg,a,pt %xcc, 1f nop ba,pt %xcc, 2f nop 1: or %g5, TD_CV, %g5 /* * Load the TTE data into the TLB and retry the instruction. */ 2: stxa %g5, [%g0] ASI_DTLB_DATA_IN_REG retry END(tl1_dmmu_miss_direct) .macro tl1_dmmu_prot ba,a %xcc, tl1_dmmu_prot_1 nop .align 128 .endm ENTRY(tl1_dmmu_prot_1) /* * Load the context and the virtual page number from the tag access * register. */ wr %g0, ASI_DMMU, %asi ldxa [%g0 + AA_DMMU_TAR] %asi, %g5 /* * Extract the context from the contents of the tag access register. * If it's non-zero this is a fault on a user address. Note that the * faulting address is passed in %g1. */ sllx %g5, 64 - TAR_VPN_SHIFT, %g6 brnz,a,pn %g6, tl1_dmmu_prot_user mov %g5, %g1 /* * Compute the address of the TTE. The TSB mask and address of the * TSB are patched at startup. */ .globl tl1_dmmu_prot_patch_tsb_1 tl1_dmmu_prot_patch_tsb_1: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_dmmu_prot_patch_tsb_mask_1 tl1_dmmu_prot_patch_tsb_mask_1: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 srlx %g5, TAR_VPN_SHIFT, %g5 and %g5, %g6, %g6 sllx %g6, TTE_SHIFT, %g6 add %g6, %g7, %g6 /* * Load the TTE. */ .globl tl1_dmmu_prot_patch_quad_ldd_1 tl1_dmmu_prot_patch_quad_ldd_1: ldda [%g6] TSB_QUAD_LDD, %g6 /*, %g7 */ /* * Check that it's valid and writeable and that the virtual page * numbers match. */ brgez,pn %g7, tl1_dmmu_prot_trap andcc %g7, TD_SW, %g0 bz,pn %xcc, tl1_dmmu_prot_trap srlx %g6, TV_SIZE_BITS, %g6 cmp %g5, %g6 bne,pn %xcc, tl1_dmmu_prot_trap EMPTY /* * Delete the old TLB entry and clear the SFSR. */ sllx %g5, TAR_VPN_SHIFT, %g6 or %g6, TLB_DEMAP_NUCLEUS, %g6 stxa %g0, [%g6] ASI_DMMU_DEMAP stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync /* * Recompute the TTE address, which we clobbered loading the TTE. * The TSB mask and address of the TSB are patched at startup. */ .globl tl1_dmmu_prot_patch_tsb_2 tl1_dmmu_prot_patch_tsb_2: sethi %uhi(TSB_KERNEL), %g6 or %g6, %ulo(TSB_KERNEL), %g6 sllx %g6, 32, %g6 sethi %hi(TSB_KERNEL), %g7 or %g7, %g6, %g7 .globl tl1_dmmu_prot_patch_tsb_mask_2 tl1_dmmu_prot_patch_tsb_mask_2: sethi %hi(TSB_KERNEL_MASK), %g6 or %g6, %lo(TSB_KERNEL_MASK), %g6 and %g5, %g6, %g5 sllx %g5, TTE_SHIFT, %g5 add %g5, %g7, %g5 /* * Set the hardware write bit. */ .globl tl1_dmmu_prot_patch_asi_1 tl1_dmmu_prot_patch_asi_1: wr %g0, TSB_ASI, %asi TTE_SET_W(%g5, %g6, %g7, a, %asi) /* * May have become invalid during casxa, in which case start over. */ brgez,pn %g6, 1f or %g6, TD_W, %g6 /* * Load the TTE data into the TLB and retry the instruction. */ stxa %g6, [%g0] ASI_DTLB_DATA_IN_REG 1: retry END(tl1_dmmu_prot_1) ENTRY(tl1_dmmu_prot_trap) /* * Switch to alternate globals. */ wrpr %g0, PSTATE_ALT, %pstate /* * Load the SFAR, SFSR and TAR. Clear the SFSR. */ ldxa [%g0 + AA_DMMU_TAR] %asi, %g2 ldxa [%g0 + AA_DMMU_SFAR] %asi, %g3 ldxa [%g0 + AA_DMMU_SFSR] %asi, %g4 stxa %g0, [%g0 + AA_DMMU_SFSR] %asi membar #Sync tl1_split clr %o1 set trap, %o2 mov %g2, %o3 mov %g3, %o4 mov %g4, %o5 ba %xcc, tl1_trap mov T_DATA_PROTECTION | T_KERNEL, %o0 END(tl1_dmmu_prot_trap) .macro tl1_spill_0_n SPILL(stx, %sp + SPOFF, 8, EMPTY) saved retry .align 32 RSF_FATAL(T_SPILL) RSF_FATAL(T_SPILL) .endm .macro tl1_spill_2_n wr %g0, ASI_AIUP, %asi SPILL(stxa, %sp + SPOFF, 8, %asi) saved retry .align 32 RSF_SPILL_TOPCB RSF_SPILL_TOPCB .endm .macro tl1_spill_3_n wr %g0, ASI_AIUP, %asi SPILL(stwa, %sp, 4, %asi) saved retry .align 32 RSF_SPILL_TOPCB RSF_SPILL_TOPCB .endm .macro tl1_spill_7_n btst 1, %sp bnz,a,pn %xcc, tl1_spill_0_n nop srl %sp, 0, %sp SPILL(stw, %sp, 4, EMPTY) saved retry .align 32 RSF_FATAL(T_SPILL) RSF_FATAL(T_SPILL) .endm .macro tl1_spill_0_o wr %g0, ASI_AIUP, %asi SPILL(stxa, %sp + SPOFF, 8, %asi) saved retry .align 32 RSF_SPILL_TOPCB RSF_SPILL_TOPCB .endm .macro tl1_spill_1_o wr %g0, ASI_AIUP, %asi SPILL(stwa, %sp, 4, %asi) saved retry .align 32 RSF_SPILL_TOPCB RSF_SPILL_TOPCB .endm .macro tl1_spill_2_o RSF_SPILL_TOPCB .align 128 .endm .macro tl1_fill_0_n FILL(ldx, %sp + SPOFF, 8, EMPTY) restored retry .align 32 RSF_FATAL(T_FILL) RSF_FATAL(T_FILL) .endm .macro tl1_fill_2_n wr %g0, ASI_AIUP, %asi FILL(ldxa, %sp + SPOFF, 8, %asi) restored retry .align 32 RSF_FILL_MAGIC RSF_FILL_MAGIC .endm .macro tl1_fill_3_n wr %g0, ASI_AIUP, %asi FILL(lduwa, %sp, 4, %asi) restored retry .align 32 RSF_FILL_MAGIC RSF_FILL_MAGIC .endm .macro tl1_fill_7_n btst 1, %sp bnz,a,pt %xcc, tl1_fill_0_n nop srl %sp, 0, %sp FILL(lduw, %sp, 4, EMPTY) restored retry .align 32 RSF_FATAL(T_FILL) RSF_FATAL(T_FILL) .endm /* * This is used to spill windows that are still occupied with user * data on kernel entry to the pcb. */ ENTRY(tl1_spill_topcb) wrpr %g0, PSTATE_ALT, %pstate /* Free some globals for our use. */ dec 24, ASP_REG stx %g1, [ASP_REG + 0] stx %g2, [ASP_REG + 8] stx %g3, [ASP_REG + 16] ldx [PCB_REG + PCB_NSAVED], %g1 sllx %g1, PTR_SHIFT, %g2 add %g2, PCB_REG, %g2 stx %sp, [%g2 + PCB_RWSP] sllx %g1, RW_SHIFT, %g2 add %g2, PCB_REG, %g2 SPILL(stx, %g2 + PCB_RW, 8, EMPTY) inc %g1 stx %g1, [PCB_REG + PCB_NSAVED] #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl1_spill_topcb: pc=%#lx npc=%#lx sp=%#lx nsaved=%d" , %g1, %g2, %g3, 7, 8, 9) rdpr %tpc, %g2 stx %g2, [%g1 + KTR_PARM1] rdpr %tnpc, %g2 stx %g2, [%g1 + KTR_PARM2] stx %sp, [%g1 + KTR_PARM3] ldx [PCB_REG + PCB_NSAVED], %g2 stx %g2, [%g1 + KTR_PARM4] 9: #endif saved ldx [ASP_REG + 16], %g3 ldx [ASP_REG + 8], %g2 ldx [ASP_REG + 0], %g1 inc 24, ASP_REG retry END(tl1_spill_topcb) .macro tl1_spill_bad count .rept \count sir .align 128 .endr .endm .macro tl1_fill_bad count .rept \count sir .align 128 .endr .endm .macro tl1_soft count .rept \count tl1_gen T_SOFT | T_KERNEL .endr .endm .sect .trap .globl tl_trap_begin tl_trap_begin: nop .align 0x8000 .globl tl0_base tl0_base: tl0_reserved 8 ! 0x0-0x7 tl0_insn_excptn: tl0_insn_excptn ! 0x8 tl0_reserved 1 ! 0x9 tl0_insn_error: tl0_gen T_INSTRUCTION_ERROR ! 0xa tl0_reserved 5 ! 0xb-0xf tl0_insn_illegal: tl0_gen T_ILLEGAL_INSTRUCTION ! 0x10 tl0_priv_opcode: tl0_gen T_PRIVILEGED_OPCODE ! 0x11 tl0_reserved 14 ! 0x12-0x1f tl0_fp_disabled: tl0_gen T_FP_DISABLED ! 0x20 tl0_fp_ieee: tl0_gen T_FP_EXCEPTION_IEEE_754 ! 0x21 tl0_fp_other: tl0_gen T_FP_EXCEPTION_OTHER ! 0x22 tl0_tag_ovflw: tl0_gen T_TAG_OVERFLOW ! 0x23 tl0_clean_window: clean_window ! 0x24 tl0_divide: tl0_gen T_DIVISION_BY_ZERO ! 0x28 tl0_reserved 7 ! 0x29-0x2f tl0_data_excptn: tl0_data_excptn ! 0x30 tl0_reserved 1 ! 0x31 tl0_data_error: tl0_gen T_DATA_ERROR ! 0x32 tl0_reserved 1 ! 0x33 tl0_align: tl0_align ! 0x34 tl0_align_lddf: tl0_gen T_RESERVED ! 0x35 tl0_align_stdf: tl0_gen T_RESERVED ! 0x36 tl0_priv_action: tl0_gen T_PRIVILEGED_ACTION ! 0x37 tl0_reserved 9 ! 0x38-0x40 tl0_intr_level: tl0_intr_level ! 0x41-0x4f tl0_reserved 16 ! 0x50-0x5f tl0_intr_vector: intr_vector ! 0x60 tl0_watch_phys: tl0_gen T_PA_WATCHPOINT ! 0x61 tl0_watch_virt: tl0_gen T_VA_WATCHPOINT ! 0x62 tl0_ecc: tl0_gen T_CORRECTED_ECC_ERROR ! 0x63 tl0_immu_miss: tl0_immu_miss ! 0x64 tl0_dmmu_miss: tl0_dmmu_miss ! 0x68 tl0_dmmu_prot: tl0_dmmu_prot ! 0x6c tl0_reserved 16 ! 0x70-0x7f tl0_spill_0_n: tl0_spill_0_n ! 0x80 tl0_spill_1_n: tl0_spill_1_n ! 0x84 tl0_spill_bad 14 ! 0x88-0xbf tl0_fill_0_n: tl0_fill_0_n ! 0xc0 tl0_fill_1_n: tl0_fill_1_n ! 0xc4 tl0_fill_bad 14 ! 0xc8-0xff tl0_soft: tl0_gen T_SYSCALL ! 0x100 tl0_gen T_BREAKPOINT ! 0x101 tl0_gen T_DIVISION_BY_ZERO ! 0x102 tl0_reserved 1 ! 0x103 tl0_gen T_CLEAN_WINDOW ! 0x104 tl0_gen T_RANGE_CHECK ! 0x105 tl0_gen T_FIX_ALIGNMENT ! 0x106 tl0_gen T_INTEGER_OVERFLOW ! 0x107 tl0_gen T_SYSCALL ! 0x108 tl0_gen T_SYSCALL ! 0x109 tl0_fp_restore ! 0x10a tl0_reserved 5 ! 0x10b-0x10f tl0_gen T_TRAP_INSTRUCTION_16 ! 0x110 tl0_gen T_TRAP_INSTRUCTION_17 ! 0x111 tl0_gen T_TRAP_INSTRUCTION_18 ! 0x112 tl0_gen T_TRAP_INSTRUCTION_19 ! 0x113 tl0_gen T_TRAP_INSTRUCTION_20 ! 0x114 tl0_gen T_TRAP_INSTRUCTION_21 ! 0x115 tl0_gen T_TRAP_INSTRUCTION_22 ! 0x116 tl0_gen T_TRAP_INSTRUCTION_23 ! 0x117 tl0_gen T_TRAP_INSTRUCTION_24 ! 0x118 tl0_gen T_TRAP_INSTRUCTION_25 ! 0x119 tl0_gen T_TRAP_INSTRUCTION_26 ! 0x11a tl0_gen T_TRAP_INSTRUCTION_27 ! 0x11b tl0_gen T_TRAP_INSTRUCTION_28 ! 0x11c tl0_gen T_TRAP_INSTRUCTION_29 ! 0x11d tl0_gen T_TRAP_INSTRUCTION_30 ! 0x11e tl0_gen T_TRAP_INSTRUCTION_31 ! 0x11f tl0_reserved 32 ! 0x120-0x13f tl0_gen T_SYSCALL ! 0x140 tl0_syscall ! 0x141 tl0_gen T_SYSCALL ! 0x142 tl0_gen T_SYSCALL ! 0x143 tl0_reserved 188 ! 0x144-0x1ff tl1_base: tl1_reserved 8 ! 0x200-0x207 tl1_insn_excptn: tl1_insn_excptn ! 0x208 tl1_reserved 1 ! 0x209 tl1_insn_error: tl1_gen T_INSTRUCTION_ERROR ! 0x20a tl1_reserved 5 ! 0x20b-0x20f tl1_insn_illegal: tl1_gen T_ILLEGAL_INSTRUCTION ! 0x210 tl1_priv_opcode: tl1_gen T_PRIVILEGED_OPCODE ! 0x211 tl1_reserved 14 ! 0x212-0x21f tl1_fp_disabled: tl1_fp_disabled ! 0x220 tl1_fp_ieee: tl1_gen T_FP_EXCEPTION_IEEE_754 ! 0x221 tl1_fp_other: tl1_gen T_FP_EXCEPTION_OTHER ! 0x222 tl1_tag_ovflw: tl1_gen T_TAG_OVERFLOW ! 0x223 tl1_clean_window: clean_window ! 0x224 tl1_divide: tl1_gen T_DIVISION_BY_ZERO ! 0x228 tl1_reserved 7 ! 0x229-0x22f tl1_data_excptn: tl1_data_excptn ! 0x230 tl1_reserved 1 ! 0x231 tl1_data_error: tl1_gen T_DATA_ERROR ! 0x232 tl1_reserved 1 ! 0x233 tl1_align: tl1_align ! 0x234 tl1_align_lddf: tl1_gen T_RESERVED ! 0x235 tl1_align_stdf: tl1_gen T_RESERVED ! 0x236 tl1_priv_action: tl1_gen T_PRIVILEGED_ACTION ! 0x237 tl1_reserved 9 ! 0x238-0x240 tl1_intr_level: tl1_intr_level ! 0x241-0x24f tl1_reserved 16 ! 0x250-0x25f tl1_intr_vector: intr_vector ! 0x260 tl1_watch_phys: tl1_gen T_PA_WATCHPOINT ! 0x261 tl1_watch_virt: tl1_gen T_VA_WATCHPOINT ! 0x262 tl1_ecc: tl1_gen T_CORRECTED_ECC_ERROR ! 0x263 tl1_immu_miss: tl1_immu_miss ! 0x264 tl1_dmmu_miss: tl1_dmmu_miss ! 0x268 tl1_dmmu_prot: tl1_dmmu_prot ! 0x26c tl1_reserved 16 ! 0x270-0x27f tl1_spill_0_n: tl1_spill_0_n ! 0x280 tl1_spill_bad 1 ! 0x284 tl1_spill_2_n: tl1_spill_2_n ! 0x288 tl1_spill_3_n: tl1_spill_3_n ! 0x28c tl1_spill_bad 3 ! 0x290-0x29b tl1_spill_7_n: tl1_spill_7_n ! 0x29c tl1_spill_0_o: tl1_spill_0_o ! 0x2a0 tl1_spill_1_o: tl1_spill_1_o ! 0x2a4 tl1_spill_2_o: tl1_spill_2_o ! 0x2a8 tl1_spill_bad 5 ! 0x2ac-0x2bf tl1_fill_0_n: tl1_fill_0_n ! 0x2c0 tl1_fill_bad 1 ! 0x2c4 tl1_fill_2_n: tl1_fill_2_n ! 0x2c8 tl1_fill_3_n: tl1_fill_3_n ! 0x2cc tl1_fill_bad 3 ! 0x2d0-0x2db tl1_fill_7_n: tl1_fill_7_n ! 0x2dc tl1_fill_bad 8 ! 0x2e0-0x2ff tl1_reserved 1 ! 0x300 tl1_breakpoint: tl1_gen T_BREAKPOINT ! 0x301 tl1_gen T_RSTRWP_PHYS ! 0x302 tl1_gen T_RSTRWP_VIRT ! 0x303 tl1_reserved 252 ! 0x304-0x3ff .globl tl_trap_end tl_trap_end: nop /* * User trap entry point * * void tl0_utrap(u_long type, u_long o1, u_long o2, u_long tar, u_long sfar, * u_long sfsr) * * This handles redirecting a trap back to usermode as a user trap. The user * program must have first registered a trap handler with the kernel using * sysarch(SPARC_UTRAP_INSTALL). The trap handler is passed enough state * for it to return to the trapping code directly, it will not return through * the kernel. The trap type is passed in %o0, all out registers must be * passed through to tl0_trap or to usermode untouched. Note that the * parameters passed in out registers may be used by the user trap handler. * Do not change the registers they are passed in or you will break the ABI. * * If the trap type allows user traps, setup state to execute the user trap * handler and bounce back to usermode, otherwise branch to tl0_trap. */ ENTRY(tl0_utrap) /* * Check if the trap type allows user traps. */ cmp %o0, UT_MAX bge,a,pt %xcc, tl0_trap nop /* * Load the user trap handler from the utrap table. */ ldx [PCPU(CURTHREAD)], %l0 ldx [%l0 + TD_PROC], %l0 ldx [%l0 + P_MD + MD_UTRAP], %l0 brz,pt %l0, tl0_trap sllx %o0, PTR_SHIFT, %l1 ldx [%l0 + %l1], %l0 brz,a,pt %l0, tl0_trap nop /* * If the save we did on entry to the kernel had to spill a window * to the pcb, pretend we took a spill trap instead. Any windows * that are in the pcb must be copied out or the fill handler will * not be able to find them, since the user trap handler returns * directly to the trapping code. Note that we only support precise * user traps, which implies that the condition that caused the trap * in the first place is still valid, so it will occur again when we * re-execute the trapping instruction. */ ldx [PCB_REG + PCB_NSAVED], %l1 brnz,a,pn %l1, tl0_trap mov T_SPILL, %o0 /* * Pass %fsr in %l4, %tstate in %l5, %tpc in %l6 and %tnpc in %l7. * The ABI specifies only %l6 and %l7, but we need to pass %fsr or * it may be clobbered by an interrupt before the user trap code * can read it, and we must pass %tstate in order to restore %ccr * and %asi. The %fsr must be stored to memory, so we use the * temporary stack for that. */ rd %fprs, %l1 or %l1, FPRS_FEF, %l2 wr %l2, 0, %fprs dec 8, ASP_REG stx %fsr, [ASP_REG] ldx [ASP_REG], %l4 inc 8, ASP_REG wr %l1, 0, %fprs rdpr %tstate, %l5 rdpr %tpc, %l6 rdpr %tnpc, %l7 /* * Setup %tnpc to return to. */ wrpr %l0, 0, %tnpc /* * Setup %wstate for return, clear WSTATE_TRANSITION. */ rdpr %wstate, %l1 and %l1, WSTATE_NORMAL_MASK, %l1 wrpr %l1, 0, %wstate /* * Setup %tstate for return, change the saved cwp to point to the * current window instead of the window at the time of the trap. */ andn %l5, TSTATE_CWP_MASK, %l1 rdpr %cwp, %l2 wrpr %l1, %l2, %tstate /* * Setup %sp. Userland processes will crash if this is not setup. */ sub %fp, CCFSZ, %sp /* * Execute the user trap handler. */ done END(tl0_utrap) /* * (Real) User trap entry point * * void tl0_trap(u_int type, u_long o1, u_long o2, u_long tar, u_long sfsr, * u_int sfsr) * * The following setup has been performed: * - the windows have been split and the active user window has been saved * (maybe just to the pcb) * - we are on alternate globals and interrupts are disabled * * We switch to the kernel stack, build a trapframe, switch to normal * globals, enable interrupts and call trap. * * NOTE: We must be very careful setting up the per-cpu pointer. We know that * it has been pre-set in alternate globals, so we read it from there and setup * the normal %g7 *before* enabling interrupts. This avoids any possibility * of cpu migration and using the wrong pcpup. */ ENTRY(tl0_trap) /* * Force kernel store order. */ wrpr %g0, PSTATE_ALT, %pstate rdpr %tstate, %l0 rdpr %tpc, %l1 rdpr %tnpc, %l2 rd %y, %l3 rd %fprs, %l4 rdpr %wstate, %l5 #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl0_trap: td=%p type=%#x pil=%#lx pc=%#lx npc=%#lx sp=%#lx" , %g1, %g2, %g3, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g2 stx %g2, [%g1 + KTR_PARM1] stx %o0, [%g1 + KTR_PARM2] rdpr %pil, %g2 stx %g2, [%g1 + KTR_PARM3] stx %l1, [%g1 + KTR_PARM4] stx %l2, [%g1 + KTR_PARM5] stx %i6, [%g1 + KTR_PARM6] 9: #endif 1: and %l5, WSTATE_NORMAL_MASK, %l5 sllx %l5, WSTATE_OTHER_SHIFT, %l5 wrpr %l5, WSTATE_KERNEL, %wstate rdpr %canrestore, %l6 wrpr %l6, 0, %otherwin wrpr %g0, 0, %canrestore sub PCB_REG, SPOFF + CCFSZ + TF_SIZEOF, %sp stx %o0, [%sp + SPOFF + CCFSZ + TF_TYPE] stx %o1, [%sp + SPOFF + CCFSZ + TF_LEVEL] stx %o3, [%sp + SPOFF + CCFSZ + TF_TAR] stx %o4, [%sp + SPOFF + CCFSZ + TF_SFAR] stx %o5, [%sp + SPOFF + CCFSZ + TF_SFSR] stx %l0, [%sp + SPOFF + CCFSZ + TF_TSTATE] stx %l1, [%sp + SPOFF + CCFSZ + TF_TPC] stx %l2, [%sp + SPOFF + CCFSZ + TF_TNPC] stx %l3, [%sp + SPOFF + CCFSZ + TF_Y] stx %l4, [%sp + SPOFF + CCFSZ + TF_FPRS] stx %l5, [%sp + SPOFF + CCFSZ + TF_WSTATE] wr %g0, FPRS_FEF, %fprs stx %fsr, [%sp + SPOFF + CCFSZ + TF_FSR] rd %gsr, %l6 stx %l6, [%sp + SPOFF + CCFSZ + TF_GSR] wr %g0, 0, %fprs mov PCB_REG, %l0 mov PCPU_REG, %l1 wrpr %g0, PSTATE_NORMAL, %pstate stx %g6, [%sp + SPOFF + CCFSZ + TF_G6] stx %g7, [%sp + SPOFF + CCFSZ + TF_G7] mov %l0, PCB_REG mov %l1, PCPU_REG wrpr %g0, PSTATE_KERNEL, %pstate stx %i0, [%sp + SPOFF + CCFSZ + TF_O0] stx %i1, [%sp + SPOFF + CCFSZ + TF_O1] stx %i2, [%sp + SPOFF + CCFSZ + TF_O2] stx %i3, [%sp + SPOFF + CCFSZ + TF_O3] stx %i4, [%sp + SPOFF + CCFSZ + TF_O4] stx %i5, [%sp + SPOFF + CCFSZ + TF_O5] stx %i6, [%sp + SPOFF + CCFSZ + TF_O6] stx %i7, [%sp + SPOFF + CCFSZ + TF_O7] stx %g1, [%sp + SPOFF + CCFSZ + TF_G1] stx %g2, [%sp + SPOFF + CCFSZ + TF_G2] stx %g3, [%sp + SPOFF + CCFSZ + TF_G3] stx %g4, [%sp + SPOFF + CCFSZ + TF_G4] stx %g5, [%sp + SPOFF + CCFSZ + TF_G5] set tl0_ret - 8, %o7 jmpl %o2, %g0 add %sp, CCFSZ + SPOFF, %o0 END(tl0_trap) /* * void tl0_intr(u_int level, u_int mask) */ ENTRY(tl0_intr) /* * Force kernel store order. */ wrpr %g0, PSTATE_ALT, %pstate rdpr %tstate, %l0 rdpr %tpc, %l1 rdpr %tnpc, %l2 rd %y, %l3 rd %fprs, %l4 rdpr %wstate, %l5 #if KTR_COMPILE & KTR_INTR CATR(KTR_INTR, "tl0_intr: td=%p level=%#x pil=%#lx pc=%#lx npc=%#lx sp=%#lx" , %g1, %g2, %g3, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g2 stx %g2, [%g1 + KTR_PARM1] stx %o0, [%g1 + KTR_PARM2] rdpr %pil, %g2 stx %g2, [%g1 + KTR_PARM3] stx %l1, [%g1 + KTR_PARM4] stx %l2, [%g1 + KTR_PARM5] stx %i6, [%g1 + KTR_PARM6] 9: #endif wrpr %o0, 0, %pil wr %o1, 0, %clear_softint and %l5, WSTATE_NORMAL_MASK, %l5 sllx %l5, WSTATE_OTHER_SHIFT, %l5 wrpr %l5, WSTATE_KERNEL, %wstate rdpr %canrestore, %l6 wrpr %l6, 0, %otherwin wrpr %g0, 0, %canrestore sub PCB_REG, SPOFF + CCFSZ + TF_SIZEOF, %sp stx %l0, [%sp + SPOFF + CCFSZ + TF_TSTATE] stx %l1, [%sp + SPOFF + CCFSZ + TF_TPC] stx %l2, [%sp + SPOFF + CCFSZ + TF_TNPC] stx %l3, [%sp + SPOFF + CCFSZ + TF_Y] stx %l4, [%sp + SPOFF + CCFSZ + TF_FPRS] stx %l5, [%sp + SPOFF + CCFSZ + TF_WSTATE] wr %g0, FPRS_FEF, %fprs stx %fsr, [%sp + SPOFF + CCFSZ + TF_FSR] rd %gsr, %l6 stx %l6, [%sp + SPOFF + CCFSZ + TF_GSR] wr %g0, 0, %fprs mov %o0, %l3 mov T_INTERRUPT, %o1 stx %o0, [%sp + SPOFF + CCFSZ + TF_LEVEL] stx %o1, [%sp + SPOFF + CCFSZ + TF_TYPE] mov PCB_REG, %l0 mov PCPU_REG, %l1 wrpr %g0, PSTATE_NORMAL, %pstate stx %g1, [%sp + SPOFF + CCFSZ + TF_G1] stx %g2, [%sp + SPOFF + CCFSZ + TF_G2] stx %g3, [%sp + SPOFF + CCFSZ + TF_G3] stx %g4, [%sp + SPOFF + CCFSZ + TF_G4] stx %g5, [%sp + SPOFF + CCFSZ + TF_G5] stx %g6, [%sp + SPOFF + CCFSZ + TF_G6] stx %g7, [%sp + SPOFF + CCFSZ + TF_G7] mov %l0, PCB_REG mov %l1, PCPU_REG wrpr %g0, PSTATE_KERNEL, %pstate stx %i0, [%sp + SPOFF + CCFSZ + TF_O0] stx %i1, [%sp + SPOFF + CCFSZ + TF_O1] stx %i2, [%sp + SPOFF + CCFSZ + TF_O2] stx %i3, [%sp + SPOFF + CCFSZ + TF_O3] stx %i4, [%sp + SPOFF + CCFSZ + TF_O4] stx %i5, [%sp + SPOFF + CCFSZ + TF_O5] stx %i6, [%sp + SPOFF + CCFSZ + TF_O6] stx %i7, [%sp + SPOFF + CCFSZ + TF_O7] SET(intr_handlers, %l1, %l0) sllx %l3, IH_SHIFT, %l1 ldx [%l0 + %l1], %l1 KASSERT(%l1, "tl0_intr: ih null") call %l1 add %sp, CCFSZ + SPOFF, %o0 /* %l3 contains PIL */ SET(intrcnt, %l1, %l2) prefetcha [%l2] ASI_N, 1 SET(pil_countp, %l1, %l0) sllx %l3, 1, %l1 lduh [%l0 + %l1], %l0 sllx %l0, 3, %l0 add %l0, %l2, %l0 ldx [%l0], %l1 inc %l1 stx %l1, [%l0] - lduw [PCPU(CNT) + V_INTR], %l0 - inc %l0 - stw %l0, [PCPU(CNT) + V_INTR] + call counter_intr_inc + nop ba,a %xcc, tl0_ret nop END(tl0_intr) /* * Initiate return to usermode. * * Called with a trapframe on the stack. The window that was setup in * tl0_trap may have been used by "fast" trap handlers that pretend to be * leaf functions, so all ins and locals may have been clobbered since * then. * * This code is rather long and complicated. */ ENTRY(tl0_ret) /* * Check for pending asts atomically with returning. We must raise * the PIL before checking, and if no asts are found the PIL must * remain raised until the retry is executed, or we risk missing asts * caused by interrupts occurring after the test. If the PIL is * lowered, as it is when we call ast, the check must be re-executed. */ wrpr %g0, PIL_TICK, %pil ldx [PCPU(CURTHREAD)], %l0 lduw [%l0 + TD_FLAGS], %l1 set TDF_ASTPENDING | TDF_NEEDRESCHED, %l2 and %l1, %l2, %l1 brz,a,pt %l1, 1f nop /* * We have an AST. Re-enable interrupts and handle it, then restart * the return sequence. */ wrpr %g0, 0, %pil call ast add %sp, CCFSZ + SPOFF, %o0 ba,a %xcc, tl0_ret nop /* * Check for windows that were spilled to the pcb and need to be * copied out. This must be the last thing that is done before the * return to usermode. If there are still user windows in the cpu * and we call a nested function after this, which causes them to be * spilled to the pcb, they will not be copied out and the stack will * be inconsistent. */ 1: ldx [PCB_REG + PCB_NSAVED], %l1 brz,a,pt %l1, 2f nop wrpr %g0, 0, %pil mov T_SPILL, %o0 stx %o0, [%sp + SPOFF + CCFSZ + TF_TYPE] call trap add %sp, SPOFF + CCFSZ, %o0 ba,a %xcc, tl0_ret nop /* * Restore the out and most global registers from the trapframe. * The ins will become the outs when we restore below. */ 2: ldx [%sp + SPOFF + CCFSZ + TF_O0], %i0 ldx [%sp + SPOFF + CCFSZ + TF_O1], %i1 ldx [%sp + SPOFF + CCFSZ + TF_O2], %i2 ldx [%sp + SPOFF + CCFSZ + TF_O3], %i3 ldx [%sp + SPOFF + CCFSZ + TF_O4], %i4 ldx [%sp + SPOFF + CCFSZ + TF_O5], %i5 ldx [%sp + SPOFF + CCFSZ + TF_O6], %i6 ldx [%sp + SPOFF + CCFSZ + TF_O7], %i7 ldx [%sp + SPOFF + CCFSZ + TF_G1], %g1 ldx [%sp + SPOFF + CCFSZ + TF_G2], %g2 ldx [%sp + SPOFF + CCFSZ + TF_G3], %g3 ldx [%sp + SPOFF + CCFSZ + TF_G4], %g4 ldx [%sp + SPOFF + CCFSZ + TF_G5], %g5 /* * Load everything we need to restore below before disabling * interrupts. */ ldx [%sp + SPOFF + CCFSZ + TF_FPRS], %l0 ldx [%sp + SPOFF + CCFSZ + TF_GSR], %l1 ldx [%sp + SPOFF + CCFSZ + TF_TNPC], %l2 ldx [%sp + SPOFF + CCFSZ + TF_TPC], %l3 ldx [%sp + SPOFF + CCFSZ + TF_TSTATE], %l4 ldx [%sp + SPOFF + CCFSZ + TF_Y], %l5 ldx [%sp + SPOFF + CCFSZ + TF_WSTATE], %l6 /* * Disable interrupts to restore the special globals. They are not * saved and restored for all kernel traps, so an interrupt at the * wrong time would clobber them. */ wrpr %g0, PSTATE_NORMAL, %pstate ldx [%sp + SPOFF + CCFSZ + TF_G6], %g6 ldx [%sp + SPOFF + CCFSZ + TF_G7], %g7 /* * Switch to alternate globals. This frees up some registers we * can use after the restore changes our window. */ wrpr %g0, PSTATE_ALT, %pstate /* * Drop %pil to zero. It must have been zero at the time of the * trap, since we were in usermode, but it was raised above in * order to check for asts atomically. We have interrupts disabled * so any interrupts will not be serviced until we complete the * return to usermode. */ wrpr %g0, 0, %pil /* * Save %fprs in an alternate global so it can be restored after the * restore instruction below. If we restore it before the restore, * and the restore traps we may run for a while with floating point * enabled in the kernel, which we want to avoid. */ mov %l0, %g1 /* * Restore %fsr and %gsr. These need floating point enabled in %fprs, * so we set it temporarily and then clear it. */ wr %g0, FPRS_FEF, %fprs ldx [%sp + SPOFF + CCFSZ + TF_FSR], %fsr wr %l1, 0, %gsr wr %g0, 0, %fprs /* * Restore program counters. This could be done after the restore * but we're out of alternate globals to store them in... */ wrpr %l2, 0, %tnpc wrpr %l3, 0, %tpc /* * Save %tstate in an alternate global and clear the %cwp field. %cwp * will be affected by the restore below and we need to make sure it * points to the current window at that time, not the window that was * active at the time of the trap. */ andn %l4, TSTATE_CWP_MASK, %g2 /* * Restore %y. Could also be below if we had more alternate globals. */ wr %l5, 0, %y /* * Setup %wstate for return. We need to restore the user window state * which we saved in wstate.other when we trapped. We also need to * set the transition bit so the restore will be handled specially * if it traps, use the xor feature of wrpr to do that. */ srlx %l6, WSTATE_OTHER_SHIFT, %g3 wrpr %g3, WSTATE_TRANSITION, %wstate /* * Setup window management registers for return. If not all user * windows were spilled in the kernel %otherwin will be non-zero, * so we need to transfer it to %canrestore to correctly restore * those windows. Otherwise everything gets set to zero and the * restore below will fill a window directly from the user stack. */ rdpr %otherwin, %o0 wrpr %o0, 0, %canrestore wrpr %g0, 0, %otherwin wrpr %o0, 0, %cleanwin /* * Now do the restore. If this instruction causes a fill trap which * fails to fill a window from the user stack, we will resume at * tl0_ret_fill_end and call back into the kernel. */ restore tl0_ret_fill: /* * We made it. We're back in the window that was active at the time * of the trap, and ready to return to usermode. */ /* * Restore %frps. This was saved in an alternate global above. */ wr %g1, 0, %fprs /* * Fixup %tstate so the saved %cwp points to the current window and * restore it. */ rdpr %cwp, %g4 wrpr %g2, %g4, %tstate /* * Restore the user window state. The transition bit was set above * for special handling of the restore, this clears it. */ wrpr %g3, 0, %wstate #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl0_ret: td=%#lx pil=%#lx pc=%#lx npc=%#lx sp=%#lx" , %g2, %g3, %g4, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g3 stx %g3, [%g2 + KTR_PARM1] rdpr %pil, %g3 stx %g3, [%g2 + KTR_PARM2] rdpr %tpc, %g3 stx %g3, [%g2 + KTR_PARM3] rdpr %tnpc, %g3 stx %g3, [%g2 + KTR_PARM4] stx %sp, [%g2 + KTR_PARM5] 9: #endif /* * Return to usermode. */ retry tl0_ret_fill_end: #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl0_ret: fill magic ps=%#lx ws=%#lx sp=%#lx" , %l0, %l1, %l2, 7, 8, 9) rdpr %pstate, %l1 stx %l1, [%l0 + KTR_PARM1] stx %l6, [%l0 + KTR_PARM2] stx %sp, [%l0 + KTR_PARM3] 9: #endif /* * The restore above caused a fill trap and the fill handler was * unable to fill a window from the user stack. The special fill * handler recognized this and punted, sending us here. We need * to carefully undo any state that was restored before the restore * was executed and call trap again. Trap will copyin a window * from the user stack which will fault in the page we need so the * restore above will succeed when we try again. If this fails * the process has trashed its stack, so we kill it. */ /* * Restore the kernel window state. This was saved in %l6 above, and * since the restore failed we're back in the same window. */ wrpr %l6, 0, %wstate /* * Restore the normal globals which have predefined values in the * kernel. We clobbered them above restoring the user's globals * so this is very important. * XXX PSTATE_ALT must already be set. */ wrpr %g0, PSTATE_ALT, %pstate mov PCB_REG, %o0 mov PCPU_REG, %o1 wrpr %g0, PSTATE_NORMAL, %pstate mov %o0, PCB_REG mov %o1, PCPU_REG wrpr %g0, PSTATE_KERNEL, %pstate /* * Simulate a fill trap and then start the whole return sequence over * again. This is special because it only copies in 1 window, not 2 * as we would for a normal failed fill. This may be the first time * the process has been run, so there may not be 2 windows worth of * stack to copyin. */ mov T_FILL_RET, %o0 stx %o0, [%sp + SPOFF + CCFSZ + TF_TYPE] call trap add %sp, SPOFF + CCFSZ, %o0 ba,a %xcc, tl0_ret nop END(tl0_ret) /* * Kernel trap entry point * * void tl1_trap(u_int type, u_long o1, u_long o2, u_long tar, u_long sfar, * u_int sfsr) * * This is easy because the stack is already setup and the windows don't need * to be split. We build a trapframe and call trap(), the same as above, but * the outs don't need to be saved. */ ENTRY(tl1_trap) rdpr %tstate, %l0 rdpr %tpc, %l1 rdpr %tnpc, %l2 rdpr %pil, %l3 rd %y, %l4 rdpr %wstate, %l5 #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl1_trap: td=%p type=%#lx pil=%#lx pc=%#lx sp=%#lx" , %g1, %g2, %g3, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g2 stx %g2, [%g1 + KTR_PARM1] stx %o0, [%g1 + KTR_PARM2] stx %l3, [%g1 + KTR_PARM3] stx %l1, [%g1 + KTR_PARM4] stx %i6, [%g1 + KTR_PARM5] 9: #endif wrpr %g0, 1, %tl and %l5, WSTATE_OTHER_MASK, %l5 wrpr %l5, WSTATE_KERNEL, %wstate stx %o0, [%sp + SPOFF + CCFSZ + TF_TYPE] stx %o1, [%sp + SPOFF + CCFSZ + TF_LEVEL] stx %o3, [%sp + SPOFF + CCFSZ + TF_TAR] stx %o4, [%sp + SPOFF + CCFSZ + TF_SFAR] stx %o5, [%sp + SPOFF + CCFSZ + TF_SFSR] stx %l0, [%sp + SPOFF + CCFSZ + TF_TSTATE] stx %l1, [%sp + SPOFF + CCFSZ + TF_TPC] stx %l2, [%sp + SPOFF + CCFSZ + TF_TNPC] stx %l3, [%sp + SPOFF + CCFSZ + TF_PIL] stx %l4, [%sp + SPOFF + CCFSZ + TF_Y] mov PCB_REG, %l0 mov PCPU_REG, %l1 wrpr %g0, PSTATE_NORMAL, %pstate stx %g6, [%sp + SPOFF + CCFSZ + TF_G6] stx %g7, [%sp + SPOFF + CCFSZ + TF_G7] mov %l0, PCB_REG mov %l1, PCPU_REG wrpr %g0, PSTATE_KERNEL, %pstate stx %i0, [%sp + SPOFF + CCFSZ + TF_O0] stx %i1, [%sp + SPOFF + CCFSZ + TF_O1] stx %i2, [%sp + SPOFF + CCFSZ + TF_O2] stx %i3, [%sp + SPOFF + CCFSZ + TF_O3] stx %i4, [%sp + SPOFF + CCFSZ + TF_O4] stx %i5, [%sp + SPOFF + CCFSZ + TF_O5] stx %i6, [%sp + SPOFF + CCFSZ + TF_O6] stx %i7, [%sp + SPOFF + CCFSZ + TF_O7] stx %g1, [%sp + SPOFF + CCFSZ + TF_G1] stx %g2, [%sp + SPOFF + CCFSZ + TF_G2] stx %g3, [%sp + SPOFF + CCFSZ + TF_G3] stx %g4, [%sp + SPOFF + CCFSZ + TF_G4] stx %g5, [%sp + SPOFF + CCFSZ + TF_G5] set tl1_ret - 8, %o7 jmpl %o2, %g0 add %sp, CCFSZ + SPOFF, %o0 END(tl1_trap) ENTRY(tl1_ret) ldx [%sp + SPOFF + CCFSZ + TF_O0], %i0 ldx [%sp + SPOFF + CCFSZ + TF_O1], %i1 ldx [%sp + SPOFF + CCFSZ + TF_O2], %i2 ldx [%sp + SPOFF + CCFSZ + TF_O3], %i3 ldx [%sp + SPOFF + CCFSZ + TF_O4], %i4 ldx [%sp + SPOFF + CCFSZ + TF_O5], %i5 ldx [%sp + SPOFF + CCFSZ + TF_O6], %i6 ldx [%sp + SPOFF + CCFSZ + TF_O7], %i7 ldx [%sp + SPOFF + CCFSZ + TF_G1], %g1 ldx [%sp + SPOFF + CCFSZ + TF_G2], %g2 ldx [%sp + SPOFF + CCFSZ + TF_G3], %g3 ldx [%sp + SPOFF + CCFSZ + TF_G4], %g4 ldx [%sp + SPOFF + CCFSZ + TF_G5], %g5 ldx [%sp + SPOFF + CCFSZ + TF_TSTATE], %l0 ldx [%sp + SPOFF + CCFSZ + TF_TPC], %l1 ldx [%sp + SPOFF + CCFSZ + TF_TNPC], %l2 ldx [%sp + SPOFF + CCFSZ + TF_PIL], %l3 ldx [%sp + SPOFF + CCFSZ + TF_Y], %l4 set VM_MIN_PROM_ADDRESS, %l5 cmp %l1, %l5 bl,a,pt %xcc, 1f nop set VM_MAX_PROM_ADDRESS, %l5 cmp %l1, %l5 bg,a,pt %xcc, 1f nop wrpr %g0, PSTATE_NORMAL, %pstate ldx [%sp + SPOFF + CCFSZ + TF_G6], %g6 ldx [%sp + SPOFF + CCFSZ + TF_G7], %g7 1: wrpr %g0, PSTATE_ALT, %pstate andn %l0, TSTATE_CWP_MASK, %g1 mov %l1, %g2 mov %l2, %g3 wrpr %l3, 0, %pil wr %l4, 0, %y restore wrpr %g0, 2, %tl rdpr %cwp, %g4 wrpr %g1, %g4, %tstate wrpr %g2, 0, %tpc wrpr %g3, 0, %tnpc #if KTR_COMPILE & KTR_TRAP CATR(KTR_TRAP, "tl1_ret: td=%#lx pil=%#lx ts=%#lx pc=%#lx sp=%#lx" , %g2, %g3, %g4, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g3 stx %g3, [%g2 + KTR_PARM1] rdpr %pil, %g3 stx %g3, [%g2 + KTR_PARM2] rdpr %tstate, %g3 stx %g3, [%g2 + KTR_PARM3] rdpr %tpc, %g3 stx %g3, [%g2 + KTR_PARM4] stx %sp, [%g2 + KTR_PARM5] 9: #endif retry END(tl1_ret) /* * void tl1_intr(u_int level, u_int mask) */ ENTRY(tl1_intr) rdpr %tstate, %l0 rdpr %tpc, %l1 rdpr %tnpc, %l2 rdpr %pil, %l3 rd %y, %l4 rdpr %wstate, %l5 #if KTR_COMPILE & KTR_INTR CATR(KTR_INTR, "tl1_intr: td=%p level=%#x pil=%#lx pc=%#lx sp=%#lx" , %g1, %g2, %g3, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g2 stx %g2, [%g1 + KTR_PARM1] stx %o0, [%g1 + KTR_PARM2] stx %l3, [%g1 + KTR_PARM3] stx %l1, [%g1 + KTR_PARM4] stx %i6, [%g1 + KTR_PARM5] 9: #endif wrpr %o0, 0, %pil wr %o1, 0, %clear_softint wrpr %g0, 1, %tl and %l5, WSTATE_OTHER_MASK, %l5 wrpr %l5, WSTATE_KERNEL, %wstate stx %l0, [%sp + SPOFF + CCFSZ + TF_TSTATE] stx %l1, [%sp + SPOFF + CCFSZ + TF_TPC] stx %l2, [%sp + SPOFF + CCFSZ + TF_TNPC] stx %l3, [%sp + SPOFF + CCFSZ + TF_PIL] stx %l4, [%sp + SPOFF + CCFSZ + TF_Y] mov %o0, %l7 mov T_INTERRUPT | T_KERNEL, %o1 stx %o0, [%sp + SPOFF + CCFSZ + TF_LEVEL] stx %o1, [%sp + SPOFF + CCFSZ + TF_TYPE] stx %i6, [%sp + SPOFF + CCFSZ + TF_O6] stx %i7, [%sp + SPOFF + CCFSZ + TF_O7] mov PCB_REG, %l4 mov PCPU_REG, %l5 wrpr %g0, PSTATE_NORMAL, %pstate stx %g1, [%sp + SPOFF + CCFSZ + TF_G1] stx %g2, [%sp + SPOFF + CCFSZ + TF_G2] stx %g3, [%sp + SPOFF + CCFSZ + TF_G3] stx %g4, [%sp + SPOFF + CCFSZ + TF_G4] stx %g5, [%sp + SPOFF + CCFSZ + TF_G5] mov %l4, PCB_REG mov %l5, PCPU_REG wrpr %g0, PSTATE_KERNEL, %pstate SET(intr_handlers, %l5, %l4) sllx %l7, IH_SHIFT, %l5 ldx [%l4 + %l5], %l5 KASSERT(%l5, "tl1_intr: ih null") call %l5 add %sp, CCFSZ + SPOFF, %o0 /* %l7 contains PIL */ SET(intrcnt, %l5, %l4) prefetcha [%l4] ASI_N, 1 SET(pil_countp, %l5, %l6) sllx %l7, 1, %l5 lduh [%l5 + %l6], %l5 sllx %l5, 3, %l5 add %l5, %l4, %l4 ldx [%l4], %l5 inc %l5 - stx %l5, [%l4] - - lduw [PCPU(CNT) + V_INTR], %l4 - inc %l4 - stw %l4, [PCPU(CNT) + V_INTR] + call counter_intr_inc + stx %l5, [%l4] ldx [%sp + SPOFF + CCFSZ + TF_Y], %l4 ldx [%sp + SPOFF + CCFSZ + TF_G1], %g1 ldx [%sp + SPOFF + CCFSZ + TF_G2], %g2 ldx [%sp + SPOFF + CCFSZ + TF_G3], %g3 ldx [%sp + SPOFF + CCFSZ + TF_G4], %g4 ldx [%sp + SPOFF + CCFSZ + TF_G5], %g5 wrpr %g0, PSTATE_ALT, %pstate andn %l0, TSTATE_CWP_MASK, %g1 mov %l1, %g2 mov %l2, %g3 wrpr %l3, 0, %pil wr %l4, 0, %y restore wrpr %g0, 2, %tl rdpr %cwp, %g4 wrpr %g1, %g4, %tstate wrpr %g2, 0, %tpc wrpr %g3, 0, %tnpc #if KTR_COMPILE & KTR_INTR CATR(KTR_INTR, "tl1_intr: td=%#x pil=%#lx ts=%#lx pc=%#lx sp=%#lx" , %g2, %g3, %g4, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g3 stx %g3, [%g2 + KTR_PARM1] rdpr %pil, %g3 stx %g3, [%g2 + KTR_PARM2] rdpr %tstate, %g3 stx %g3, [%g2 + KTR_PARM3] rdpr %tpc, %g3 stx %g3, [%g2 + KTR_PARM4] stx %sp, [%g2 + KTR_PARM5] 9: #endif retry END(tl1_intr) .globl tl_text_end tl_text_end: nop /* * Freshly forked processes come here when switched to for the first time. * The arguments to fork_exit() have been setup in the locals, we must move * them to the outs. */ ENTRY(fork_trampoline) #if KTR_COMPILE & KTR_PROC CATR(KTR_PROC, "fork_trampoline: td=%p (%s) cwp=%#lx" , %g1, %g2, %g3, 7, 8, 9) ldx [PCPU(CURTHREAD)], %g2 stx %g2, [%g1 + KTR_PARM1] ldx [%g2 + TD_PROC], %g2 add %g2, P_COMM, %g2 stx %g2, [%g1 + KTR_PARM2] rdpr %cwp, %g2 stx %g2, [%g1 + KTR_PARM3] 9: #endif mov %l0, %o0 mov %l1, %o1 call fork_exit mov %l2, %o2 ba,a %xcc, tl0_ret nop END(fork_trampoline) Index: head/sys/sparc64/sparc64/genassym.c =================================================================== --- head/sys/sparc64/sparc64/genassym.c (revision 317060) +++ head/sys/sparc64/sparc64/genassym.c (revision 317061) @@ -1,245 +1,244 @@ /*- * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include ASSYM(KERNBASE, KERNBASE); ASSYM(KSTACK_PAGES, KSTACK_PAGES); ASSYM(PCPU_PAGES, PCPU_PAGES); ASSYM(TAR_VPN_SHIFT, TAR_VPN_SHIFT); ASSYM(_NCPUBITS, _NCPUBITS); ASSYM(TLB_DEMAP_ALL, TLB_DEMAP_ALL); ASSYM(TLB_DEMAP_CONTEXT, TLB_DEMAP_CONTEXT); ASSYM(TLB_DEMAP_NUCLEUS, TLB_DEMAP_NUCLEUS); ASSYM(TLB_DEMAP_PAGE, TLB_DEMAP_PAGE); ASSYM(TLB_DEMAP_PRIMARY, TLB_DEMAP_PRIMARY); ASSYM(INT_SHIFT, INT_SHIFT); ASSYM(PTR_SHIFT, PTR_SHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_SHIFT_8K, PAGE_SHIFT_8K); ASSYM(PAGE_SHIFT_4M, PAGE_SHIFT_4M); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(PAGE_SIZE_4M, PAGE_SIZE_4M); #ifdef SMP ASSYM(CSA_PCPU, offsetof(struct cpu_start_args, csa_pcpu)); ASSYM(CSA_STATE, offsetof(struct cpu_start_args, csa_state)); ASSYM(CSA_MID, offsetof(struct cpu_start_args, csa_mid)); ASSYM(CSA_STICK, offsetof(struct cpu_start_args, csa_stick)); ASSYM(CSA_TICK, offsetof(struct cpu_start_args, csa_tick)); ASSYM(CSA_TTES, offsetof(struct cpu_start_args, csa_ttes)); ASSYM(CSA_VER, offsetof(struct cpu_start_args, csa_ver)); #endif ASSYM(DC_SIZE, offsetof(struct cacheinfo, dc_size)); ASSYM(DC_LINESIZE, offsetof(struct cacheinfo, dc_linesize)); ASSYM(IC_SIZE, offsetof(struct cacheinfo, ic_size)); ASSYM(IC_LINESIZE, offsetof(struct cacheinfo, ic_linesize)); ASSYM(KTR_SIZEOF, sizeof(struct ktr_entry)); ASSYM(KTR_LINE, offsetof(struct ktr_entry, ktr_line)); ASSYM(KTR_FILE, offsetof(struct ktr_entry, ktr_file)); ASSYM(KTR_DESC, offsetof(struct ktr_entry, ktr_desc)); ASSYM(KTR_CPU, offsetof(struct ktr_entry, ktr_cpu)); ASSYM(KTR_TIMESTAMP, offsetof(struct ktr_entry, ktr_timestamp)); ASSYM(KTR_PARM1, offsetof(struct ktr_entry, ktr_parms[0])); ASSYM(KTR_PARM2, offsetof(struct ktr_entry, ktr_parms[1])); ASSYM(KTR_PARM3, offsetof(struct ktr_entry, ktr_parms[2])); ASSYM(KTR_PARM4, offsetof(struct ktr_entry, ktr_parms[3])); ASSYM(KTR_PARM5, offsetof(struct ktr_entry, ktr_parms[4])); ASSYM(KTR_PARM6, offsetof(struct ktr_entry, ktr_parms[5])); ASSYM(TTE_SHIFT, TTE_SHIFT); ASSYM(TTE_VPN, offsetof(struct tte, tte_vpn)); ASSYM(TTE_DATA, offsetof(struct tte, tte_data)); ASSYM(TD_V, TD_V); ASSYM(TD_EXEC, TD_EXEC); ASSYM(TD_REF, TD_REF); ASSYM(TD_SW, TD_SW); ASSYM(TD_L, TD_L); ASSYM(TD_CP, TD_CP); ASSYM(TD_CV, TD_CV); ASSYM(TD_W, TD_W); ASSYM(TS_MIN, TS_MIN); ASSYM(TS_MAX, TS_MAX); ASSYM(TLB_DAR_SLOT_SHIFT, TLB_DAR_SLOT_SHIFT); ASSYM(TLB_CXR_PGSZ_MASK, TLB_CXR_PGSZ_MASK); ASSYM(TLB_DIRECT_ADDRESS_MASK, TLB_DIRECT_ADDRESS_MASK); ASSYM(TLB_DIRECT_TO_TTE_MASK, TLB_DIRECT_TO_TTE_MASK); ASSYM(TV_SIZE_BITS, TV_SIZE_BITS); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_IRHEAD, offsetof(struct pcpu, pc_irhead)); ASSYM(PC_IRTAIL, offsetof(struct pcpu, pc_irtail)); ASSYM(PC_IRFREE, offsetof(struct pcpu, pc_irfree)); -ASSYM(PC_CNT, offsetof(struct pcpu, pc_cnt)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_CACHE, offsetof(struct pcpu, pc_cache)); ASSYM(PC_MID, offsetof(struct pcpu, pc_mid)); ASSYM(PC_PMAP, offsetof(struct pcpu, pc_pmap)); ASSYM(PC_TLB_CTX, offsetof(struct pcpu, pc_tlb_ctx)); ASSYM(PC_TLB_CTX_MAX, offsetof(struct pcpu, pc_tlb_ctx_max)); ASSYM(PC_TLB_CTX_MIN, offsetof(struct pcpu, pc_tlb_ctx_min)); ASSYM(IR_NEXT, offsetof(struct intr_request, ir_next)); ASSYM(IR_FUNC, offsetof(struct intr_request, ir_func)); ASSYM(IR_ARG, offsetof(struct intr_request, ir_arg)); ASSYM(IR_PRI, offsetof(struct intr_request, ir_pri)); ASSYM(IR_VEC, offsetof(struct intr_request, ir_vec)); #ifdef SMP ASSYM(ICA_PA, offsetof(struct ipi_cache_args, ica_pa)); ASSYM(IRA_MASK, offsetof(struct ipi_rd_args, ira_mask)); ASSYM(IRA_VAL, offsetof(struct ipi_rd_args, ira_val)); ASSYM(ITA_MASK, offsetof(struct ipi_tlb_args, ita_mask)); ASSYM(ITA_PMAP, offsetof(struct ipi_tlb_args, ita_pmap)); ASSYM(ITA_START, offsetof(struct ipi_tlb_args, ita_start)); ASSYM(ITA_END, offsetof(struct ipi_tlb_args, ita_end)); ASSYM(ITA_VA, offsetof(struct ipi_tlb_args, ita_va)); #endif ASSYM(IV_FUNC, offsetof(struct intr_vector, iv_func)); ASSYM(IV_ARG, offsetof(struct intr_vector, iv_arg)); ASSYM(IV_PRI, offsetof(struct intr_vector, iv_pri)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(MD_UTRAP, offsetof(struct mdproc, md_utrap)); ASSYM(P_COMM, offsetof(struct proc, p_comm)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(P_PID, offsetof(struct proc, p_pid)); ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_FRAME, offsetof(struct thread, td_frame)); ASSYM(TD_KSTACK, offsetof(struct thread, td_kstack)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(MD_SAVED_PIL, offsetof(struct mdthread, md_saved_pil)); ASSYM(PCB_SIZEOF, sizeof(struct pcb)); ASSYM(PCB_RW, offsetof(struct pcb, pcb_rw)); ASSYM(PCB_KFP, offsetof(struct pcb, pcb_kfp)); ASSYM(PCB_UFP, offsetof(struct pcb, pcb_ufp)); ASSYM(PCB_RWSP, offsetof(struct pcb, pcb_rwsp)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_NSAVED, offsetof(struct pcb, pcb_nsaved)); ASSYM(PCB_PC, offsetof(struct pcb, pcb_pc)); ASSYM(PCB_SP, offsetof(struct pcb, pcb_sp)); ASSYM(PCB_PAD, offsetof(struct pcb, pcb_pad)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(PM_CONTEXT, offsetof(struct pmap, pm_context)); ASSYM(PM_TSB, offsetof(struct pmap, pm_tsb)); ASSYM(_JB_FP, offsetof(struct _jmp_buf, _jb[_JB_FP])); ASSYM(_JB_PC, offsetof(struct _jmp_buf, _jb[_JB_PC])); ASSYM(_JB_SP, offsetof(struct _jmp_buf, _jb[_JB_SP])); ASSYM(_JB_SIGFLAG, offsetof(struct _jmp_buf, _jb[_JB_SIGFLAG])); ASSYM(_JB_SIGMASK, offsetof(struct _jmp_buf, _jb[_JB_SIGMASK])); ASSYM(TF_G0, offsetof(struct trapframe, tf_global[0])); ASSYM(TF_G1, offsetof(struct trapframe, tf_global[1])); ASSYM(TF_G2, offsetof(struct trapframe, tf_global[2])); ASSYM(TF_G3, offsetof(struct trapframe, tf_global[3])); ASSYM(TF_G4, offsetof(struct trapframe, tf_global[4])); ASSYM(TF_G5, offsetof(struct trapframe, tf_global[5])); ASSYM(TF_G6, offsetof(struct trapframe, tf_global[6])); ASSYM(TF_G7, offsetof(struct trapframe, tf_global[7])); ASSYM(TF_O0, offsetof(struct trapframe, tf_out[0])); ASSYM(TF_O1, offsetof(struct trapframe, tf_out[1])); ASSYM(TF_O2, offsetof(struct trapframe, tf_out[2])); ASSYM(TF_O3, offsetof(struct trapframe, tf_out[3])); ASSYM(TF_O4, offsetof(struct trapframe, tf_out[4])); ASSYM(TF_O5, offsetof(struct trapframe, tf_out[5])); ASSYM(TF_O6, offsetof(struct trapframe, tf_out[6])); ASSYM(TF_O7, offsetof(struct trapframe, tf_out[7])); ASSYM(TF_FPRS, offsetof(struct trapframe, tf_fprs)); ASSYM(TF_FSR, offsetof(struct trapframe, tf_fsr)); ASSYM(TF_GSR, offsetof(struct trapframe, tf_gsr)); ASSYM(TF_PIL, offsetof(struct trapframe, tf_pil)); ASSYM(TF_LEVEL, offsetof(struct trapframe, tf_level)); ASSYM(TF_SFAR, offsetof(struct trapframe, tf_sfar)); ASSYM(TF_SFSR, offsetof(struct trapframe, tf_sfsr)); ASSYM(TF_TAR, offsetof(struct trapframe, tf_tar)); ASSYM(TF_TYPE, offsetof(struct trapframe, tf_type)); ASSYM(TF_Y, offsetof(struct trapframe, tf_y)); ASSYM(TF_TNPC, offsetof(struct trapframe, tf_tnpc)); ASSYM(TF_TPC, offsetof(struct trapframe, tf_tpc)); ASSYM(TF_TSTATE, offsetof(struct trapframe, tf_tstate)); ASSYM(TF_WSTATE, offsetof(struct trapframe, tf_wstate)); ASSYM(TF_SIZEOF, sizeof(struct trapframe)); ASSYM(VM_MIN_DIRECT_ADDRESS, VM_MIN_DIRECT_ADDRESS); ASSYM(VM_MIN_PROM_ADDRESS, VM_MIN_PROM_ADDRESS); ASSYM(VM_MAX_PROM_ADDRESS, VM_MAX_PROM_ADDRESS); Index: head/sys/sparc64/sparc64/intr_machdep.c =================================================================== --- head/sys/sparc64/sparc64/intr_machdep.c (revision 317060) +++ head/sys/sparc64/sparc64/intr_machdep.c (revision 317061) @@ -1,558 +1,572 @@ /*- * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 2001 Jake Burkholder. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * form: src/sys/i386/isa/intr_machdep.c,v 1.57 2001/07/20 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_STRAY_LOG 5 CTASSERT((1 << IV_SHIFT) == sizeof(struct intr_vector)); ih_func_t *intr_handlers[PIL_MAX]; uint16_t pil_countp[PIL_MAX]; static uint16_t pil_stray_count[PIL_MAX]; struct intr_vector intr_vectors[IV_MAX]; uint16_t intr_countp[IV_MAX]; static uint16_t intr_stray_count[IV_MAX]; static const char *const pil_names[] = { "stray", "low", /* PIL_LOW */ "preempt", /* PIL_PREEMPT */ "ithrd", /* PIL_ITHREAD */ "rndzvs", /* PIL_RENDEZVOUS */ "ast", /* PIL_AST */ "hardclock", /* PIL_HARDCLOCK */ "stray", "stray", "stray", "stray", "filter", /* PIL_FILTER */ "bridge", /* PIL_BRIDGE */ "stop", /* PIL_STOP */ "tick", /* PIL_TICK */ }; /* protect the intr_vectors table */ static struct sx intr_table_lock; /* protect intrcnt_index */ static struct mtx intrcnt_lock; #ifdef SMP static int assign_cpu; static void intr_assign_next_cpu(struct intr_vector *iv); static void intr_shuffle_irqs(void *arg __unused); #endif static int intr_assign_cpu(void *arg, int cpu); static void intr_execute_handlers(void *); static void intr_stray_level(struct trapframe *); static void intr_stray_vector(void *); static int intrcnt_setname(const char *, int); static void intrcnt_updatename(int, const char *, int); +void counter_intr_inc(void); static void intrcnt_updatename(int vec, const char *name, int ispil) { static int intrcnt_index, stray_pil_index, stray_vec_index; int name_index; mtx_lock_spin(&intrcnt_lock); if (intrnames[0] == '\0') { /* for bitbucket */ if (bootverbose) printf("initalizing intr_countp\n"); intrcnt_setname("???", intrcnt_index++); stray_vec_index = intrcnt_index++; intrcnt_setname("stray", stray_vec_index); for (name_index = 0; name_index < IV_MAX; name_index++) intr_countp[name_index] = stray_vec_index; stray_pil_index = intrcnt_index++; intrcnt_setname("pil", stray_pil_index); for (name_index = 0; name_index < PIL_MAX; name_index++) pil_countp[name_index] = stray_pil_index; } if (name == NULL) name = "???"; if (!ispil && intr_countp[vec] != stray_vec_index) name_index = intr_countp[vec]; else if (ispil && pil_countp[vec] != stray_pil_index) name_index = pil_countp[vec]; else name_index = intrcnt_index++; if (intrcnt_setname(name, name_index)) name_index = 0; if (!ispil) intr_countp[vec] = name_index; else pil_countp[vec] = name_index; mtx_unlock_spin(&intrcnt_lock); } static int intrcnt_setname(const char *name, int index) { if ((MAXCOMLEN + 1) * index >= sintrnames) return (E2BIG); snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); return (0); } void intr_setup(int pri, ih_func_t *ihf, int vec, iv_func_t *ivf, void *iva) { char pilname[MAXCOMLEN + 1]; register_t s; s = intr_disable(); if (vec != -1) { intr_vectors[vec].iv_func = ivf; intr_vectors[vec].iv_arg = iva; intr_vectors[vec].iv_pri = pri; intr_vectors[vec].iv_vec = vec; } intr_handlers[pri] = ihf; intr_restore(s); snprintf(pilname, MAXCOMLEN + 1, "pil%d: %s", pri, pil_names[pri]); intrcnt_updatename(pri, pilname, 1); } static void intr_stray_level(struct trapframe *tf) { uint64_t level; level = tf->tf_level; if (pil_stray_count[level] < MAX_STRAY_LOG) { printf("stray level interrupt %ld\n", level); pil_stray_count[level]++; if (pil_stray_count[level] >= MAX_STRAY_LOG) printf("got %d stray level interrupt %ld's: not " "logging anymore\n", MAX_STRAY_LOG, level); } } static void intr_stray_vector(void *cookie) { struct intr_vector *iv; u_int vec; iv = cookie; vec = iv->iv_vec; if (intr_stray_count[vec] < MAX_STRAY_LOG) { printf("stray vector interrupt %d\n", vec); intr_stray_count[vec]++; if (intr_stray_count[vec] >= MAX_STRAY_LOG) printf("got %d stray vector interrupt %d's: not " "logging anymore\n", MAX_STRAY_LOG, vec); } } void intr_init1() { int i; /* Mark all interrupts as being stray. */ for (i = 0; i < PIL_MAX; i++) intr_handlers[i] = intr_stray_level; for (i = 0; i < IV_MAX; i++) { intr_vectors[i].iv_func = intr_stray_vector; intr_vectors[i].iv_arg = &intr_vectors[i]; intr_vectors[i].iv_pri = PIL_LOW; intr_vectors[i].iv_vec = i; intr_vectors[i].iv_refcnt = 0; } intr_handlers[PIL_LOW] = intr_fast; } void intr_init2() { sx_init(&intr_table_lock, "intr sources"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct pcpu *pc; struct intr_vector *iv; /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. */ if (assign_cpu && cpu != NOCPU) { pc = pcpu_find(cpu); if (pc == NULL) return (EINVAL); iv = arg; sx_xlock(&intr_table_lock); iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); sx_xunlock(&intr_table_lock); } return (0); #else return (EOPNOTSUPP); #endif } static void intr_execute_handlers(void *cookie) { struct intr_vector *iv; iv = cookie; if (__predict_false(intr_event_handle(iv->iv_event, NULL) != 0)) intr_stray_vector(iv); } int intr_controller_register(int vec, const struct intr_controller *ic, void *icarg) { struct intr_event *ie; struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; ie = iv->iv_event; sx_xunlock(&intr_table_lock); if (ie != NULL) return (EEXIST); error = intr_event_create(&ie, iv, 0, vec, NULL, ic->ic_clear, ic->ic_clear, intr_assign_cpu, "vec%d:", vec); if (error != 0) return (error); sx_xlock(&intr_table_lock); if (iv->iv_event != NULL) { sx_xunlock(&intr_table_lock); intr_event_destroy(ie); return (EEXIST); } iv->iv_ic = ic; iv->iv_icarg = icarg; iv->iv_event = ie; iv->iv_mid = PCPU_GET(mid); sx_xunlock(&intr_table_lock); return (0); } int inthand_add(const char *name, int vec, driver_filter_t *filt, driver_intr_t *handler, void *arg, int flags, void **cookiep) { const struct intr_controller *ic; struct intr_event *ie; struct intr_handler *ih; struct intr_vector *iv; int error, filter; if (vec < 0 || vec >= IV_MAX) return (EINVAL); /* * INTR_BRIDGE filters/handlers are special purpose only, allowing * them to be shared just would complicate things unnecessarily. */ if ((flags & INTR_BRIDGE) != 0 && (flags & INTR_EXCL) == 0) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; ic = iv->iv_ic; ie = iv->iv_event; sx_xunlock(&intr_table_lock); if (ic == NULL || ie == NULL) return (EINVAL); error = intr_event_add_handler(ie, name, filt, handler, arg, intr_priority(flags), flags, cookiep); if (error != 0) return (error); sx_xlock(&intr_table_lock); /* Disable the interrupt while we fiddle with it. */ ic->ic_disable(iv); iv->iv_refcnt++; if (iv->iv_refcnt == 1) intr_setup((flags & INTR_BRIDGE) != 0 ? PIL_BRIDGE : filt != NULL ? PIL_FILTER : PIL_ITHREAD, intr_fast, vec, intr_execute_handlers, iv); else if (filt != NULL) { /* * Check if we need to upgrade from PIL_ITHREAD to PIL_FILTER. * Given that apart from the on-board SCCs and UARTs shared * interrupts are rather uncommon on sparc64 this should be * pretty rare in practice. */ filter = 0; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_filter != NULL && ih->ih_filter != filt) { filter = 1; break; } } if (filter == 0) intr_setup(PIL_FILTER, intr_fast, vec, intr_execute_handlers, iv); } intr_stray_count[vec] = 0; intrcnt_updatename(vec, ie->ie_fullname, 0); #ifdef SMP if (assign_cpu) intr_assign_next_cpu(iv); #endif ic->ic_enable(iv); /* Ensure the interrupt is cleared, it might have triggered before. */ if (ic->ic_clear != NULL) ic->ic_clear(iv); sx_xunlock(&intr_table_lock); return (0); } int inthand_remove(int vec, void *cookie) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); error = intr_event_remove_handler(cookie); if (error == 0) { /* * XXX: maybe this should be done regardless of whether * intr_event_remove_handler() succeeded? */ sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; iv->iv_refcnt--; if (iv->iv_refcnt == 0) { /* * Don't disable the interrupt for now, so that * stray interrupts get detected... */ intr_setup(PIL_LOW, intr_fast, vec, intr_stray_vector, iv); } sx_xunlock(&intr_table_lock); } return (error); } /* Add a description to an active interrupt handler. */ int intr_describe(int vec, void *ih, const char *descr) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; if (iv == NULL) { sx_xunlock(&intr_table_lock); return (EINVAL); } error = intr_event_describe_handler(iv->iv_event, ih, descr); if (error) { sx_xunlock(&intr_table_lock); return (error); } intrcnt_updatename(vec, iv->iv_event->ie_fullname, 0); sx_xunlock(&intr_table_lock); return (error); +} + +/* + * Do VM_CNT_INC(intr), being in the interrupt context already. This is + * called from assembly. + * To avoid counter_enter() and appropriate assertion, unwrap VM_CNT_INC() + * and hardcode the actual increment. + */ +void +counter_intr_inc(void) +{ + + *(uint64_t *)zpcpu_get(vm_cnt.v_intr) += 1; } #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu; static void intr_assign_next_cpu(struct intr_vector *iv) { struct pcpu *pc; sx_assert(&intr_table_lock, SA_XLOCKED); /* * Assign this source to a CPU in a round-robin fashion. */ pc = pcpu_find(current_cpu); if (pc == NULL) return; iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); do { current_cpu++; if (current_cpu > mp_maxid) current_cpu = 0; } while (!CPU_ISSET(current_cpu, &intr_cpus)); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(int vec, u_char cpu) { struct intr_vector *iv; int error; if (vec < 0 || vec >= IV_MAX) return (EINVAL); sx_xlock(&intr_table_lock); iv = &intr_vectors[vec]; if (iv == NULL) { sx_xunlock(&intr_table_lock); return (EINVAL); } error = intr_event_bind(iv->iv_event, cpu); sx_xunlock(&intr_table_lock); return (error); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding CPU %d as a target\n", cpu); CPU_SET(cpu, &intr_cpus); } /* * Distribute all the interrupt sources among the available CPUs once the * APs have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct pcpu *pc; struct intr_vector *iv; int i; /* Don't bother on UP. */ if (mp_ncpus == 1) return; sx_xlock(&intr_table_lock); assign_cpu = 1; for (i = 0; i < IV_MAX; i++) { iv = &intr_vectors[i]; if (iv != NULL && iv->iv_refcnt > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. */ if (iv->iv_event->ie_cpu != NOCPU && (pc = pcpu_find(iv->iv_event->ie_cpu)) != NULL) { iv->iv_mid = pc->pc_mid; iv->iv_ic->ic_assign(iv); } else intr_assign_next_cpu(iv); } } sx_xunlock(&intr_table_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); #endif Index: head/sys/sparc64/sparc64/machdep.c =================================================================== --- head/sys/sparc64/sparc64/machdep.c (revision 317060) +++ head/sys/sparc64/sparc64/machdep.c (revision 317061) @@ -1,1113 +1,1114 @@ /*- * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef int ofw_vec_t(void *); int dtlb_slots; int itlb_slots; struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; void *dpcpu0; char pcpu0[PCPU_PAGES * PAGE_SIZE]; +struct pcpu dummy_pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; u_int tba_taken_over; char sparc64_model[32]; static int cpu_use_vis = 1; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl); void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); static void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *arg); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz / PAGE_SIZE; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE, vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = rdpr(pil); wrpr(pil, 0, PIL_TICK); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_pil = pil; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t pil; td = curthread; critical_exit(); pil = td->td_md.md_saved_pil; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) wrpr(pil, pil, 0); } static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl) { char type[sizeof("cpu")]; phandle_t child; uint32_t portid; for (; node != 0; node = OF_peer(node)) { child = OF_child(node); if (child > 0) { child = find_bsp(child, bspid, cpu_impl); if (child > 0) return (child); } else { if (OF_getprop(node, "device_type", type, sizeof(type)) <= 0) continue; if (strcmp(type, "cpu") != 0) continue; if (OF_getprop(node, cpu_portid_prop(cpu_impl), &portid, sizeof(portid)) <= 0) continue; if (portid == bspid) return (node); } } return (0); } const char * cpu_portid_prop(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return ("upa-portid"); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return ("portid"); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return ("cpuid"); default: return (""); } } uint32_t cpu_get_mid(u_int cpu_impl) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_SPARC64V: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIII: case CPU_IMPL_ULTRASPARCIIIp: return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG, ASI_FIREPLANE_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIIIip: return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG))); case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID))); default: return (0); } } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { char *env; struct pcpu *pc; vm_offset_t end; vm_offset_t va; caddr_t kmdp; phandle_t root; u_int cpu_impl; end = 0; kmdp = NULL; /* * Find out what kind of CPU we have first, for anything that changes * behaviour. */ cpu_impl = VER_IMPL(rdpr(ver)); /* * Do CPU-specific initialization. */ if (cpu_impl >= CPU_IMPL_ULTRASPARCIII) cheetah_init(cpu_impl); else if (cpu_impl == CPU_IMPL_SPARC64V) zeus_init(cpu_impl); /* * Clear (S)TICK timer (including NPT). */ tick_clear(cpu_impl); /* * UltraSparc II[e,i] based systems come up with the tick interrupt * enabled and a handler that resets the tick counter, causing DELAY() * to not work properly when used early in boot. * UltraSPARC III based systems come up with the system tick interrupt * enabled, causing an interrupt storm on startup since they are not * handled. */ tick_stop(cpu_impl); /* * Set up Open Firmware entry points. */ ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit() gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); /* * Initialize Open Firmware (needed for console). */ OF_install(OFW_STD_DIRECT, 0); OF_init(ofw_entry); /* * Prime our per-CPU data page for use. Note, we are using it for * our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init * or it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_addr = (vm_offset_t)pcpu0; pc->pc_impl = cpu_impl; pc->pc_mid = cpu_get_mid(cpu_impl); pc->pc_tlb_ctx = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX; /* * Determine the OFW node and frequency of the BSP (and ensure the * BSP is in the device tree in the first place). */ root = OF_peer(0); pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl); if (pc->pc_node == 0) OF_panic("%s: cannot find boot CPU node", __func__); if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock, sizeof(pc->pc_clock)) <= 0) OF_panic("%s: cannot determine boot CPU clock", __func__); /* * Panic if there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL || end == 0 || kernel_tlb_slots == 0 || kernel_tlbs == NULL) OF_panic("%s: missing loader metadata.\nThis probably means " "you are not using loader(8).", __func__); /* * Work around the broken loader behavior of not demapping no * longer used kernel TLB slots when unloading the kernel or * modules. */ for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M; va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) { if (bootverbose) OF_printf("demapping unused kernel TLB slot " "(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_DMMU_DEMAP, 0); stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0); flush(KERNBASE); kernel_tlb_slots--; } /* * Determine the TLB slot maxima, which are expected to be * equal across all CPUs. * NB: for cheetah-class CPUs, these properties only refer * to the t16s. */ if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots, sizeof(dtlb_slots)) == -1) OF_panic("%s: cannot determine number of dTLB slots", __func__); if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots, sizeof(itlb_slots)) == -1) OF_panic("%s: cannot determine number of iTLB slots", __func__); /* * Initialize and enable the caches. Note that this may include * applying workarounds. */ cache_init(pc); cache_enable(cpu_impl); uma_set_align(pc->pc_cache.dc_linesize - 1); cpu_block_copy = bcopy; cpu_block_zero = bzero; getenv_int("machdep.use_vis", &cpu_use_vis); if (cpu_use_vis) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: case CPU_IMPL_ULTRASPARCIII: /* NB: we've disabled P$. */ case CPU_IMPL_ULTRASPARCIIIp: case CPU_IMPL_ULTRASPARCIIIi: case CPU_IMPL_ULTRASPARCIV: case CPU_IMPL_ULTRASPARCIVp: case CPU_IMPL_ULTRASPARCIIIip: cpu_block_copy = spitfire_block_copy; cpu_block_zero = spitfire_block_zero; break; case CPU_IMPL_SPARC64V: cpu_block_copy = zeus_block_copy; cpu_block_zero = zeus_block_zero; break; } } #ifdef SMP mp_init(); #endif /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(cpu_impl); /* * Initialize tunables. */ init_param2(physmem); env = kern_getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize the interrupt tables. */ intr_init1(); /* * Initialize proc0, set kstack0, frame0, curthread and curpcb. */ proc_linkup0(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; thread0.td_kstack = kstack0; thread0.td_kstack_pages = KSTACK_PAGES; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; /* * Initialize global registers. */ cpu_setregs(pc); /* * Take over the trap table via the PROM. Using the PROM for this * is necessary in order to set obp-control-relinquished to true * within the PROM so obtaining /virtual-memory/translations doesn't * trigger a fatal reset error or worse things further down the road. * XXX it should be possible to use this solely instead of writing * %tba in cpu_setregs(). Doing so causes a hang however. * * NB: the low-level console drivers require a working DELAY() and * some compiler optimizations may cause the curthread accesses of * mutex(9) to be factored out even if the latter aren't actually * called. Both of these require PCPU_REG to be set. However, we * can't set PCPU_REG without also taking over the trap table or the * firmware will overwrite it. */ sun4u_set_traptable(tl0_base); /* * Initialize the dynamic per-CPU area for the BSP and the message * buffer (after setting the trap table). */ dpcpu_init(dpcpu0, 0); msgbufinit(msgbufp, msgbufsize); /* * Initialize mutexes. */ mutex_init(); /* * Initialize console now that we have a reasonable set of system * services. */ cninit(); /* * Finish the interrupt initialization now that mutexes work and * enable them. */ intr_init2(); wrpr(pil, 0, 0); wrpr(pstate, 0, PSTATE_KERNEL); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; u_long sp; int oonstack; int sig; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal trampoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->_mc_tpc, mc->_mc_sp, mc->_mc_tstate); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; /* * Copy the registers which will be restored by tl0_ret() from the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. */ mc->_mc_flags = _MC_VERSION; mc->mc_global[1] = tf->tf_global[1]; mc->mc_global[2] = tf->tf_global[2]; mc->mc_global[3] = tf->tf_global[3]; mc->mc_global[4] = tf->tf_global[4]; mc->mc_global[5] = tf->tf_global[5]; mc->mc_global[6] = tf->tf_global[6]; if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } else { mc->mc_out[0] = tf->tf_out[0]; mc->mc_out[1] = tf->tf_out[1]; } mc->mc_out[2] = tf->tf_out[2]; mc->mc_out[3] = tf->tf_out[3]; mc->mc_out[4] = tf->tf_out[4]; mc->mc_out[5] = tf->tf_out[5]; mc->mc_out[6] = tf->tf_out[6]; mc->mc_out[7] = tf->tf_out[7]; mc->_mc_fprs = tf->tf_fprs; mc->_mc_fsr = tf->tf_fsr; mc->_mc_gsr = tf->tf_gsr; mc->_mc_tnpc = tf->tf_tnpc; mc->_mc_tpc = tf->tf_tpc; mc->_mc_tstate = tf->tf_tstate; mc->_mc_y = tf->tf_y; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); tf->tf_fprs &= ~FPRS_FEF; pcb->pcb_flags |= PCB_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->_mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; if (!TSTATE_SECURE(mc->_mc_tstate) || (mc->_mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); /* * Copy the registers which will be restored by tl0_ret() to the * trapframe. * Note that we skip %g7 which is used as the userland TLS register * and %wstate. */ tf->tf_global[1] = mc->mc_global[1]; tf->tf_global[2] = mc->mc_global[2]; tf->tf_global[3] = mc->mc_global[3]; tf->tf_global[4] = mc->mc_global[4]; tf->tf_global[5] = mc->mc_global[5]; tf->tf_global[6] = mc->mc_global[6]; tf->tf_out[0] = mc->mc_out[0]; tf->tf_out[1] = mc->mc_out[1]; tf->tf_out[2] = mc->mc_out[2]; tf->tf_out[3] = mc->mc_out[3]; tf->tf_out[4] = mc->mc_out[4]; tf->tf_out[5] = mc->mc_out[5]; tf->tf_out[6] = mc->mc_out[6]; tf->tf_out[7] = mc->mc_out[7]; tf->tf_fprs = mc->_mc_fprs; tf->tf_fsr = mc->_mc_fsr; tf->tf_gsr = mc->_mc_gsr; tf->tf_tnpc = mc->_mc_tnpc; tf->tf_tpc = mc->_mc_tpc; tf->tf_tstate = mc->_mc_tstate; tf->tf_y = mc->_mc_y; if ((mc->_mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif ofw_exit(args); } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { struct pcpu *pc; pc = pcpu_find(cpu_id); if (pc == NULL || rate == NULL) return (EINVAL); *rate = pc->pc_clock; return (0); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } static void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware. */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(int busy) { /* Insert code to halt (until next interrupt) for the idle loop. */ } int cpu_idle_wakeup(int cpu) { return (1); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; struct pcb *pcb; struct proc *p; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = imgp->entry_addr + 4; tf->tf_tpc = imgp->entry_addr; /* * While we could adhere to the memory model indicated in the ELF * header, it turns out that just always using TSO performs best. */ tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } Index: head/sys/sparc64/sparc64/trap.c =================================================================== --- head/sys/sparc64/sparc64/trap.c (revision 317060) +++ head/sys/sparc64/sparc64/trap.c (revision 317061) @@ -1,617 +1,617 @@ /*- * Copyright (c) 2001, Jake Burkholder * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * from: FreeBSD: src/sys/i386/i386/trap.c,v 1.197 2001/07/19 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktr.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void trap(struct trapframe *tf); void syscall(struct trapframe *tf); static int trap_cecc(void); static int trap_pfault(struct thread *td, struct trapframe *tf); extern char copy_fault[]; extern char copy_nofault_begin[]; extern char copy_nofault_end[]; extern char fs_fault[]; extern char fs_nofault_begin[]; extern char fs_nofault_end[]; extern char fs_nofault_intr_begin[]; extern char fs_nofault_intr_end[]; extern char fas_fault[]; extern char fas_nofault_begin[]; extern char fas_nofault_end[]; const char *const trap_msg[] = { "reserved", "instruction access exception", "instruction access error", "instruction access protection", "illtrap instruction", "illegal instruction", "privileged opcode", "floating point disabled", "floating point exception ieee 754", "floating point exception other", "tag overflow", "division by zero", "data access exception", "data access error", "data access protection", "memory address not aligned", "privileged action", "async data error", "trap instruction 16", "trap instruction 17", "trap instruction 18", "trap instruction 19", "trap instruction 20", "trap instruction 21", "trap instruction 22", "trap instruction 23", "trap instruction 24", "trap instruction 25", "trap instruction 26", "trap instruction 27", "trap instruction 28", "trap instruction 29", "trap instruction 30", "trap instruction 31", "fast instruction access mmu miss", "fast data access mmu miss", "interrupt", "physical address watchpoint", "virtual address watchpoint", "corrected ecc error", "spill", "fill", "fill", "breakpoint", "clean window", "range check", "fix alignment", "integer overflow", "syscall", "restore physical watchpoint", "restore virtual watchpoint", "kernel stack fault", }; static const int trap_sig[] = { SIGILL, /* reserved */ SIGILL, /* instruction access exception */ SIGILL, /* instruction access error */ SIGILL, /* instruction access protection */ SIGILL, /* illtrap instruction */ SIGILL, /* illegal instruction */ SIGBUS, /* privileged opcode */ SIGFPE, /* floating point disabled */ SIGFPE, /* floating point exception ieee 754 */ SIGFPE, /* floating point exception other */ SIGEMT, /* tag overflow */ SIGFPE, /* division by zero */ SIGILL, /* data access exception */ SIGILL, /* data access error */ SIGBUS, /* data access protection */ SIGBUS, /* memory address not aligned */ SIGBUS, /* privileged action */ SIGBUS, /* async data error */ SIGILL, /* trap instruction 16 */ SIGILL, /* trap instruction 17 */ SIGILL, /* trap instruction 18 */ SIGILL, /* trap instruction 19 */ SIGILL, /* trap instruction 20 */ SIGILL, /* trap instruction 21 */ SIGILL, /* trap instruction 22 */ SIGILL, /* trap instruction 23 */ SIGILL, /* trap instruction 24 */ SIGILL, /* trap instruction 25 */ SIGILL, /* trap instruction 26 */ SIGILL, /* trap instruction 27 */ SIGILL, /* trap instruction 28 */ SIGILL, /* trap instruction 29 */ SIGILL, /* trap instruction 30 */ SIGILL, /* trap instruction 31 */ SIGSEGV, /* fast instruction access mmu miss */ SIGSEGV, /* fast data access mmu miss */ -1, /* interrupt */ -1, /* physical address watchpoint */ -1, /* virtual address watchpoint */ -1, /* corrected ecc error */ SIGILL, /* spill */ SIGILL, /* fill */ SIGILL, /* fill */ SIGTRAP, /* breakpoint */ SIGILL, /* clean window */ SIGILL, /* range check */ SIGILL, /* fix alignment */ SIGILL, /* integer overflow */ SIGSYS, /* syscall */ -1, /* restore physical watchpoint */ -1, /* restore virtual watchpoint */ -1, /* kernel stack fault */ }; CTASSERT(nitems(trap_msg) == T_MAX); CTASSERT(nitems(trap_sig) == T_MAX); CTASSERT(sizeof(struct trapframe) == 256); int debugger_on_signal = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_signal, CTLFLAG_RW, &debugger_on_signal, 0, ""); u_int corrected_ecc = 0; SYSCTL_UINT(_machdep, OID_AUTO, corrected_ecc, CTLFLAG_RD, &corrected_ecc, 0, "corrected ECC errors"); /* * SUNW,set-trap-table allows to take over %tba from the PROM, which * will turn off interrupts and handle outstanding ones while doing so, * in a safe way. */ void sun4u_set_traptable(void *tba_addr) { static struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t tba_addr; } args = { (cell_t)"SUNW,set-trap-table", 1, 0, }; args.tba_addr = (cell_t)tba_addr; ofw_entry(&args); } void trap(struct trapframe *tf) { struct thread *td; struct proc *p; int error; int sig; register_t addr; ksiginfo_t ksi; td = curthread; CTR4(KTR_TRAP, "trap: %p type=%s (%s) pil=%#lx", td, trap_msg[tf->tf_type & ~T_KERNEL], (TRAPF_USERMODE(tf) ? "user" : "kernel"), rdpr(pil)); - PCPU_INC(cnt.v_trap); + VM_CNT_INC(v_trap); if ((tf->tf_tstate & TSTATE_PRIV) == 0) { KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; td->td_pticks = 0; td->td_frame = tf; addr = tf->tf_tpc; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (tf->tf_type) { case T_DATA_MISS: case T_DATA_PROTECTION: addr = tf->tf_sfar; /* FALLTHROUGH */ case T_INSTRUCTION_MISS: sig = trap_pfault(td, tf); break; case T_FILL: sig = rwindow_load(td, tf, 2); break; case T_FILL_RET: sig = rwindow_load(td, tf, 1); break; case T_SPILL: sig = rwindow_save(td); break; case T_CORRECTED_ECC_ERROR: sig = trap_cecc(); break; default: if (tf->tf_type > T_MAX) panic("trap: bad trap type %#lx (user)", tf->tf_type); else if (trap_sig[tf->tf_type] == -1) panic("trap: %s (user)", trap_msg[tf->tf_type]); sig = trap_sig[tf->tf_type]; break; } if (sig != 0) { /* Translate fault for emulators. */ if (p->p_sysent->sv_transtrap != NULL) { sig = p->p_sysent->sv_transtrap(sig, tf->tf_type); } if (debugger_on_signal && (sig == 4 || sig == 10 || sig == 11)) kdb_enter(KDB_WHY_TRAPSIG, "trapsig"); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int)tf->tf_type; /* XXX not POSIX */ ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = (int)tf->tf_type; trapsignal(td, &ksi); } userret(td, tf); } else { KASSERT((tf->tf_type & T_KERNEL) != 0, ("trap: kernel trap isn't")); if (kdb_active) { kdb_reenter(); return; } switch (tf->tf_type & ~T_KERNEL) { case T_BREAKPOINT: case T_KSTACK_FAULT: error = (kdb_trap(tf->tf_type, 0, tf) == 0); TF_DONE(tf); break; #ifdef notyet case T_PA_WATCHPOINT: case T_VA_WATCHPOINT: error = db_watch_trap(tf); break; #endif case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: error = trap_pfault(td, tf); break; case T_DATA_EXCEPTION: case T_MEM_ADDRESS_NOT_ALIGNED: if ((tf->tf_sfsr & MMU_SFSR_FV) != 0 && MMU_SFSR_GET_ASI(tf->tf_sfsr) == ASI_AIUP) { if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } } error = 1; break; case T_DATA_ERROR: /* * Handle PCI poke/peek as per UltraSPARC IIi * User's Manual 16.2.1, modulo checking the * TPC as USIII CPUs generate a precise trap * instead of a special deferred one. */ if (tf->tf_tpc > (u_long)fas_nofault_begin && tf->tf_tpc < (u_long)fas_nofault_end) { cache_flush(); cache_enable(PCPU_GET(impl)); tf->tf_tpc = (u_long)fas_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } error = 1; break; case T_CORRECTED_ECC_ERROR: error = trap_cecc(); break; default: error = 1; break; } if (error != 0) { tf->tf_type &= ~T_KERNEL; if (tf->tf_type > T_MAX) panic("trap: bad trap type %#lx (kernel)", tf->tf_type); panic("trap: %s (kernel)", trap_msg[tf->tf_type]); } } CTR1(KTR_TRAP, "trap: td=%p return", td); } static int trap_cecc(void) { u_long eee; /* * Turn off (non-)correctable error reporting while we're dealing * with the error. */ eee = ldxa(0, ASI_ESTATE_ERROR_EN_REG); stxa_sync(0, ASI_ESTATE_ERROR_EN_REG, eee & ~(AA_ESTATE_NCEEN | AA_ESTATE_CEEN)); /* Flush the caches in order ensure no corrupt data got installed. */ cache_flush(); /* Ensure the caches are still turned on (should be). */ cache_enable(PCPU_GET(impl)); /* Clear the error from the AFSR. */ stxa_sync(0, ASI_AFSR, ldxa(0, ASI_AFSR)); corrected_ecc++; printf("corrected ECC error\n"); /* Turn (non-)correctable error reporting back on. */ stxa_sync(0, ASI_ESTATE_ERROR_EN_REG, eee); return (0); } static int trap_pfault(struct thread *td, struct trapframe *tf) { vm_map_t map; struct proc *p; vm_offset_t va; vm_prot_t prot; vm_map_entry_t entry; u_long ctx; int type; int rv; if (td == NULL) return (-1); KASSERT(td->td_pcb != NULL, ("trap_pfault: pcb NULL")); KASSERT(td->td_proc != NULL, ("trap_pfault: curproc NULL")); KASSERT(td->td_proc->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); p = td->td_proc; rv = KERN_SUCCESS; ctx = TLB_TAR_CTX(tf->tf_tar); type = tf->tf_type & ~T_KERNEL; va = TLB_TAR_VA(tf->tf_tar); CTR4(KTR_TRAP, "trap_pfault: td=%p pm_ctx=%#lx va=%#lx ctx=%#lx", td, p->p_vmspace->vm_pmap.pm_context[curcpu], va, ctx); if (type == T_DATA_PROTECTION) prot = VM_PROT_WRITE; else { if (type == T_DATA_MISS) prot = VM_PROT_READ; else prot = VM_PROT_READ | VM_PROT_EXECUTE; } if (ctx != TLB_CTX_KERNEL) { if ((tf->tf_tstate & TSTATE_PRIV) != 0 && (tf->tf_tpc >= (u_long)fs_nofault_intr_begin && tf->tf_tpc <= (u_long)fs_nofault_intr_end)) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } /* This is a fault on non-kernel virtual memory. */ map = &p->p_vmspace->vm_map; } else { /* * This is a fault on kernel virtual memory. Attempts to * access kernel memory from user mode cause privileged * action traps, not page fault. */ KASSERT(tf->tf_tstate & TSTATE_PRIV, ("trap_pfault: fault on nucleus context from user mode")); if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { vm_map_lock_read(kernel_map); if (vm_map_lookup_entry(kernel_map, va, &entry) && (entry->eflags & MAP_ENTRY_NOFAULT) != 0) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; vm_map_unlock_read(kernel_map); return (0); } vm_map_unlock_read(kernel_map); } map = kernel_map; } /* Fault in the page. */ rv = vm_fault(map, va, prot, VM_FAULT_NORMAL); CTR3(KTR_TRAP, "trap_pfault: return td=%p va=%#lx rv=%d", td, va, rv); if (rv == KERN_SUCCESS) return (0); if (ctx != TLB_CTX_KERNEL && (tf->tf_tstate & TSTATE_PRIV) != 0) { if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } /* Maximum number of arguments that can be passed via the out registers. */ #define REG_MAXARGS 6 int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct trapframe *tf; struct proc *p; register_t *argp; int reg; int regcnt; int error; p = td->td_proc; tf = td->td_frame; reg = 0; regcnt = REG_MAXARGS; sa->code = tf->tf_global[1]; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = tf->tf_out[reg++]; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = sa->args; bcopy(&tf->tf_out[reg], sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) error = copyin((void *)(tf->tf_out[6] + SPOFF + offsetof(struct frame, fr_pad[6])), &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; } return (error); } #include "../../kern/subr_syscall.c" /* * Syscall handler * The arguments to the syscall are passed in the out registers by the caller, * and are saved in the trap frame. The syscall number is passed in %g1 (and * also saved in the trap frame). */ void syscall(struct trapframe *tf) { struct thread *td; struct syscall_args sa; int error; td = curthread; td->td_frame = tf; KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); /* * For syscalls, we don't want to retry the faulting instruction * (usually), instead we need to advance one instruction. */ td->td_pcb->pcb_tpc = tf->tf_tpc; TF_DONE(tf); error = syscallenter(td, &sa); syscallret(td, error, &sa); } Index: head/sys/sys/pcpu.h =================================================================== --- head/sys/sys/pcpu.h (revision 317060) +++ head/sys/sys/pcpu.h (revision 317061) @@ -1,239 +1,238 @@ /*- * Copyright (c) 2001 Wind River Systems, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_PCPU_H_ #define _SYS_PCPU_H_ #ifdef LOCORE #error "no assembler-serviceable parts inside" #endif #include #include #include #include #include #include -#include #include #include #define DPCPU_SETNAME "set_pcpu" #define DPCPU_SYMPREFIX "pcpu_entry_" #ifdef _KERNEL /* * Define a set for pcpu data. */ extern uintptr_t *__start_set_pcpu; __GLOBL(__start_set_pcpu); extern uintptr_t *__stop_set_pcpu; __GLOBL(__stop_set_pcpu); /* * Array of dynamic pcpu base offsets. Indexed by id. */ extern uintptr_t dpcpu_off[]; /* * Convenience defines. */ #define DPCPU_START ((uintptr_t)&__start_set_pcpu) #define DPCPU_STOP ((uintptr_t)&__stop_set_pcpu) #define DPCPU_BYTES (DPCPU_STOP - DPCPU_START) #define DPCPU_MODMIN 2048 #define DPCPU_SIZE roundup2(DPCPU_BYTES, PAGE_SIZE) #define DPCPU_MODSIZE (DPCPU_SIZE - (DPCPU_BYTES - DPCPU_MODMIN)) /* * Declaration and definition. */ #define DPCPU_NAME(n) pcpu_entry_##n #define DPCPU_DECLARE(t, n) extern t DPCPU_NAME(n) #define DPCPU_DEFINE(t, n) t DPCPU_NAME(n) __section(DPCPU_SETNAME) __used /* * Accessors with a given base. */ #define _DPCPU_PTR(b, n) \ (__typeof(DPCPU_NAME(n))*)((b) + (uintptr_t)&DPCPU_NAME(n)) #define _DPCPU_GET(b, n) (*_DPCPU_PTR(b, n)) #define _DPCPU_SET(b, n, v) (*_DPCPU_PTR(b, n) = v) /* * Accessors for the current cpu. */ #define DPCPU_PTR(n) _DPCPU_PTR(PCPU_GET(dynamic), n) #define DPCPU_GET(n) (*DPCPU_PTR(n)) #define DPCPU_SET(n, v) (*DPCPU_PTR(n) = v) /* * Accessors for remote cpus. */ #define DPCPU_ID_PTR(i, n) _DPCPU_PTR(dpcpu_off[(i)], n) #define DPCPU_ID_GET(i, n) (*DPCPU_ID_PTR(i, n)) #define DPCPU_ID_SET(i, n, v) (*DPCPU_ID_PTR(i, n) = v) /* * Utility macros. */ #define DPCPU_SUM(n) __extension__ \ ({ \ u_int _i; \ __typeof(*DPCPU_PTR(n)) sum; \ \ sum = 0; \ CPU_FOREACH(_i) { \ sum += *DPCPU_ID_PTR(_i, n); \ } \ sum; \ }) #define DPCPU_VARSUM(n, var) __extension__ \ ({ \ u_int _i; \ __typeof((DPCPU_PTR(n))->var) sum; \ \ sum = 0; \ CPU_FOREACH(_i) { \ sum += (DPCPU_ID_PTR(_i, n))->var; \ } \ sum; \ }) #define DPCPU_ZERO(n) do { \ u_int _i; \ \ CPU_FOREACH(_i) { \ bzero(DPCPU_ID_PTR(_i, n), sizeof(*DPCPU_PTR(n))); \ } \ } while(0) #endif /* _KERNEL */ /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. The members are accessed via the PCPU_GET/SET/PTR * macros defined in . Machine dependent fields are * defined in the PCPU_MD_FIELDS macro defined in . */ struct pcpu { struct thread *pc_curthread; /* Current thread */ struct thread *pc_idlethread; /* Idle thread */ struct thread *pc_fpcurthread; /* Fp state owner */ struct thread *pc_deadthread; /* Zombie thread or NULL */ struct pcb *pc_curpcb; /* Current pcb */ uint64_t pc_switchtime; /* cpu_ticks() at last csw */ int pc_switchticks; /* `ticks' at last csw */ u_int pc_cpuid; /* This cpu number */ STAILQ_ENTRY(pcpu) pc_allcpu; struct lock_list_entry *pc_spinlocks; - struct vmmeter pc_cnt; /* VM stats counters */ long pc_cp_time[CPUSTATES]; /* statclock ticks */ struct device *pc_device; void *pc_netisr; /* netisr SWI cookie */ int pc_unused1; /* unused field */ int pc_domain; /* Memory domain. */ struct rm_queue pc_rm_queue; /* rmlock list of trackers */ uintptr_t pc_dynamic; /* Dynamic per-cpu data area */ + uint64_t pc_early_dummy_counter; /* Startup time counter(9) */ /* * Keep MD fields last, so that CPU-specific variations on a * single architecture don't result in offset variations of * the machine-independent fields of the pcpu. Even though * the pcpu structure is private to the kernel, some ports * (e.g., lsof, part of gtop) define _KERNEL and include this * header. While strictly speaking this is wrong, there's no * reason not to keep the offsets of the MI fields constant * if only to make kernel debugging easier. */ PCPU_MD_FIELDS; } __aligned(CACHE_LINE_SIZE); #ifdef CTASSERT /* * To minimize memory waste in per-cpu UMA zones, size of struct pcpu * should be denominator of PAGE_SIZE. */ CTASSERT((PAGE_SIZE / sizeof(struct pcpu)) * sizeof(struct pcpu) == PAGE_SIZE); #endif #ifdef _KERNEL STAILQ_HEAD(cpuhead, pcpu); extern struct cpuhead cpuhead; extern struct pcpu *cpuid_to_pcpu[]; #define curcpu PCPU_GET(cpuid) #define curproc (curthread->td_proc) #ifndef curthread #define curthread PCPU_GET(curthread) #endif #define curvidata PCPU_GET(vidata) /* Accessor to elements allocated via UMA_ZONE_PCPU zone. */ static inline void * zpcpu_get(void *base) { return ((char *)(base) + sizeof(struct pcpu) * curcpu); } static inline void * zpcpu_get_cpu(void *base, int cpu) { return ((char *)(base) + sizeof(struct pcpu) * cpu); } /* * Machine dependent callouts. cpu_pcpu_init() is responsible for * initializing machine dependent fields of struct pcpu, and * db_show_mdpcpu() is responsible for handling machine dependent * fields for the DDB 'show pcpu' command. */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size); void db_show_mdpcpu(struct pcpu *pcpu); void *dpcpu_alloc(int size); void dpcpu_copy(void *s, int size); void dpcpu_free(void *s, int size); void dpcpu_init(void *dpcpu, int cpuid); void pcpu_destroy(struct pcpu *pcpu); struct pcpu *pcpu_find(u_int cpuid); void pcpu_init(struct pcpu *pcpu, int cpuid, size_t size); #endif /* _KERNEL */ #endif /* !_SYS_PCPU_H_ */ Index: head/sys/sys/vmmeter.h =================================================================== --- head/sys/sys/vmmeter.h (revision 317060) +++ head/sys/sys/vmmeter.h (revision 317061) @@ -1,221 +1,221 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vmmeter.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_VMMETER_H_ #define _SYS_VMMETER_H_ /* * This value is used by ps(1) to change sleep state flag from 'S' to * 'I' and by the sched process to set the alarm clock. */ #define MAXSLP 20 +/* Systemwide totals computed every five seconds. */ +struct vmtotal { + int16_t t_rq; /* length of the run queue */ + int16_t t_dw; /* jobs in ``disk wait'' (neg priority) */ + int16_t t_pw; /* jobs in page wait */ + int16_t t_sl; /* jobs sleeping in core */ + int16_t t_sw; /* swapped out runnable/short block jobs */ + int32_t t_vm; /* total virtual memory */ + int32_t t_avm; /* active virtual memory */ + int32_t t_rm; /* total real memory in use */ + int32_t t_arm; /* active real memory */ + int32_t t_vmshr; /* shared virtual memory */ + int32_t t_avmshr; /* active shared virtual memory */ + int32_t t_rmshr; /* shared real memory */ + int32_t t_armshr; /* active shared real memory */ + int32_t t_free; /* free memory pages */ +}; + +#if defined(_KERNEL) || defined(_WANT_VMMETER) +#include + /* * System wide statistics counters. * Locking: * a - locked by atomic operations * c - constant after initialization * f - locked by vm_page_queue_free_mtx - * p - locked by being in the PCPU and atomicity respect to interrupts + * p - uses counter(9) * q - changes are synchronized by the corresponding vm_pagequeue lock */ struct vmmeter { /* * General system activity. */ - u_int v_swtch; /* (p) context switches */ - u_int v_trap; /* (p) calls to trap */ - u_int v_syscall; /* (p) calls to syscall() */ - u_int v_intr; /* (p) device interrupts */ - u_int v_soft; /* (p) software interrupts */ + counter_u64_t v_swtch; /* (p) context switches */ + counter_u64_t v_trap; /* (p) calls to trap */ + counter_u64_t v_syscall; /* (p) calls to syscall() */ + counter_u64_t v_intr; /* (p) device interrupts */ + counter_u64_t v_soft; /* (p) software interrupts */ /* * Virtual memory activity. */ - u_int v_vm_faults; /* (p) address memory faults */ - u_int v_io_faults; /* (p) page faults requiring I/O */ - u_int v_cow_faults; /* (p) copy-on-writes faults */ - u_int v_cow_optim; /* (p) optimized copy-on-writes faults */ - u_int v_zfod; /* (p) pages zero filled on demand */ - u_int v_ozfod; /* (p) optimized zero fill pages */ - u_int v_swapin; /* (p) swap pager pageins */ - u_int v_swapout; /* (p) swap pager pageouts */ - u_int v_swappgsin; /* (p) swap pager pages paged in */ - u_int v_swappgsout; /* (p) swap pager pages paged out */ - u_int v_vnodein; /* (p) vnode pager pageins */ - u_int v_vnodeout; /* (p) vnode pager pageouts */ - u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ - u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ - u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */ - u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */ - u_int v_pdpages; /* (p) pages analyzed by daemon */ - u_int v_pdshortfalls; /* (p) page reclamation shortfalls */ + counter_u64_t v_vm_faults; /* (p) address memory faults */ + counter_u64_t v_io_faults; /* (p) page faults requiring I/O */ + counter_u64_t v_cow_faults; /* (p) copy-on-writes faults */ + counter_u64_t v_cow_optim; /* (p) optimized COW faults */ + counter_u64_t v_zfod; /* (p) pages zero filled on demand */ + counter_u64_t v_ozfod; /* (p) optimized zero fill pages */ + counter_u64_t v_swapin; /* (p) swap pager pageins */ + counter_u64_t v_swapout; /* (p) swap pager pageouts */ + counter_u64_t v_swappgsin; /* (p) swap pager pages paged in */ + counter_u64_t v_swappgsout; /* (p) swap pager pages paged out */ + counter_u64_t v_vnodein; /* (p) vnode pager pageins */ + counter_u64_t v_vnodeout; /* (p) vnode pager pageouts */ + counter_u64_t v_vnodepgsin; /* (p) vnode_pager pages paged in */ + counter_u64_t v_vnodepgsout; /* (p) vnode pager pages paged out */ + counter_u64_t v_intrans; /* (p) intransit blocking page faults */ + counter_u64_t v_reactivated; /* (p) reactivated by the pagedaemon */ + counter_u64_t v_pdwakeups; /* (p) times daemon has awaken */ + counter_u64_t v_pdpages; /* (p) pages analyzed by daemon */ + counter_u64_t v_pdshortfalls; /* (p) page reclamation shortfalls */ - u_int v_dfree; /* (p) pages freed by daemon */ - u_int v_pfree; /* (p) pages freed by exiting processes */ - u_int v_tfree; /* (p) total pages freed */ + counter_u64_t v_dfree; /* (p) pages freed by daemon */ + counter_u64_t v_pfree; /* (p) pages freed by processes */ + counter_u64_t v_tfree; /* (p) total pages freed */ /* + * Fork/vfork/rfork activity. + */ + counter_u64_t v_forks; /* (p) fork() calls */ + counter_u64_t v_vforks; /* (p) vfork() calls */ + counter_u64_t v_rforks; /* (p) rfork() calls */ + counter_u64_t v_kthreads; /* (p) fork() calls by kernel */ + counter_u64_t v_forkpages; /* (p) pages affected by fork() */ + counter_u64_t v_vforkpages; /* (p) pages affected by vfork() */ + counter_u64_t v_rforkpages; /* (p) pages affected by rfork() */ + counter_u64_t v_kthreadpages; /* (p) ... and by kernel fork() */ +#define VM_METER_NCOUNTERS \ + (offsetof(struct vmmeter, v_page_size) / sizeof(counter_u64_t)) + /* * Distribution of page usages. */ u_int v_page_size; /* (c) page size in bytes */ u_int v_page_count; /* (c) total number of pages in system */ u_int v_free_reserved; /* (c) pages reserved for deadlock */ u_int v_free_target; /* (c) pages desired free */ u_int v_free_min; /* (c) pages desired free */ u_int v_free_count; /* (f) pages free */ u_int v_wire_count; /* (a) pages wired down */ u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ u_int v_laundry_count; /* (q) pages eligible for laundering */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ u_int v_free_severe; /* (c) severe page depletion point */ - /* - * Fork/vfork/rfork activity. - */ - u_int v_forks; /* (p) fork() calls */ - u_int v_vforks; /* (p) vfork() calls */ - u_int v_rforks; /* (p) rfork() calls */ - u_int v_kthreads; /* (p) fork() calls by kernel */ - u_int v_forkpages; /* (p) VM pages affected by fork() */ - u_int v_vforkpages; /* (p) VM pages affected by vfork() */ - u_int v_rforkpages; /* (p) VM pages affected by rfork() */ - u_int v_kthreadpages; /* (p) VM pages affected by fork() by kernel */ }; +#endif /* _KERNEL || _WANT_VMMETER */ + #ifdef _KERNEL extern struct vmmeter vm_cnt; - extern u_int vm_pageout_wakeup_thresh; +#define VM_CNT_ADD(var, x) counter_u64_add(vm_cnt.var, x) +#define VM_CNT_INC(var) VM_CNT_ADD(var, 1) +#define VM_CNT_FETCH(var) counter_u64_fetch(vm_cnt.var) + /* * Return TRUE if we are under our severe low-free-pages threshold * * This routine is typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static inline int vm_page_count_severe(void) { return (vm_cnt.v_free_severe > vm_cnt.v_free_count); } /* * Return TRUE if we are under our minimum low-free-pages threshold. * * This routine is typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more * desperate. */ static inline int vm_page_count_min(void) { return (vm_cnt.v_free_min > vm_cnt.v_free_count); } /* * Return TRUE if we have not reached our free page target during * free page recovery operations. */ static inline int vm_page_count_target(void) { return (vm_cnt.v_free_target > vm_cnt.v_free_count); } /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ static inline int vm_paging_target(void) { return (vm_cnt.v_free_target - vm_cnt.v_free_count); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ static inline int vm_paging_needed(void) { return (vm_cnt.v_free_count < vm_pageout_wakeup_thresh); } /* * Return the number of pages we need to launder. * A positive number indicates that we have a shortfall of clean pages. */ static inline int vm_laundry_target(void) { return (vm_paging_target()); } - -/* - * Obtain the value of a per-CPU counter. - */ -#define VM_METER_PCPU_CNT(member) \ - vm_meter_cnt(__offsetof(struct vmmeter, member)) - -u_int vm_meter_cnt(size_t); - -#endif - -/* systemwide totals computed every five seconds */ -struct vmtotal { - int16_t t_rq; /* length of the run queue */ - int16_t t_dw; /* jobs in ``disk wait'' (neg priority) */ - int16_t t_pw; /* jobs in page wait */ - int16_t t_sl; /* jobs sleeping in core */ - int16_t t_sw; /* swapped out runnable/short block jobs */ - int32_t t_vm; /* total virtual memory */ - int32_t t_avm; /* active virtual memory */ - int32_t t_rm; /* total real memory in use */ - int32_t t_arm; /* active real memory */ - int32_t t_vmshr; /* shared virtual memory */ - int32_t t_avmshr; /* active shared virtual memory */ - int32_t t_rmshr; /* shared real memory */ - int32_t t_armshr; /* active shared real memory */ - int32_t t_free; /* free memory pages */ -}; - -#endif +#endif /* _KERNEL */ +#endif /* _SYS_VMMETER_H_ */ Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 317060) +++ head/sys/vm/swap_pager.c (revision 317061) @@ -1,2825 +1,2825 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16 * or 32 pages per allocation. * The 32-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * The swblock structure maps an object and a small, fixed-size range * of page indices to disk addresses within a swap area. * The collection of these mappings is implemented as a hash table. * Unused disk addresses within a swap area are allocated and managed * using a blist. */ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static vm_ooffset_t swap_total; SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, "Total amount of available swap storage."); static vm_ooffset_t swap_reserved; SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, "Amount of swap storage needed to back all allocated anonymous memory."); static int overcommit = 0; SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { vm_ooffset_t r, s; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; if (incr & PAGE_MASK) panic("swap_reserve: & PAGE_MASK"); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (0); } #endif res = 0; mtx_lock(&sw_dev_mtx); r = swap_reserved + incr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count; s *= PAGE_SIZE; } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; swap_reserved = r; } mtx_unlock(&sw_dev_mtx); if (res) { UIDINFO_VMSIZE_LOCK(uip); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) res = 0; else uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); if (!res) { mtx_lock(&sw_dev_mtx); swap_reserved -= incr; mtx_unlock(&sw_dev_mtx); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (!res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; mtx_lock(&sw_dev_mtx); swap_reserved += incr; mtx_unlock(&sw_dev_mtx); #ifdef RACCT PROC_LOCK(curproc); racct_add_force(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); #endif uip = curthread->td_ucred->cr_ruidinfo; PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curthread->td_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { struct uidinfo *uip; uip = cred->cr_ruidinfo; if (decr & PAGE_MASK) panic("swap_release: & PAGE_MASK"); mtx_lock(&sw_dev_mtx); if (swap_reserved < decr) panic("swap_reserved < decr"); swap_reserved -= decr; mtx_unlock(&sw_dev_mtx); UIDINFO_VMSIZE_LOCK(uip); if (uip->ui_vmsize < decr) printf("negative vmsize for uid = %d\n", uip->ui_uid); uip->ui_vmsize -= decr; UIDINFO_VMSIZE_UNLOCK(uip); racct_sub_cred(cred, RACCT_SWAP, decr); } #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 32 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ n = vm_cnt.v_page_count / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_reserve_kva(swap_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %lu to %lu.\n", n2, n); swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblock); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } object->un_pager.swp.swp_bcount = 0; return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_WLOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_WUNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_WUNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && srcobject->handle != NULL) { vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "m" of length "count". The caller * may optionally specify that additional pages preceding and succeeding * the specified range be paged in. The number of such pages is returned * in the "rbehind" and "rahead" parameters, and they will be in the * inactive queue upon return. * * The pages in "m" must be busied and will remain busied upon return. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, j, maxahead, maxbehind, reqcount, shift; reqcount = count; VM_OBJECT_WUNLOCK(object); bp = getpbuf(&nsw_rcount); VM_OBJECT_WLOCK(object); if (!swap_pager_haspage(object, m[0]->pindex, &maxbehind, &maxahead)) { relpbuf(bp, &nsw_rcount); return (VM_PAGER_FAIL); } /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = m[reqcount - 1]->pindex; msucc = TAILQ_NEXT(m[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = m[0]->pindex; mpred = TAILQ_PREV(m[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } /* * Allocate readahead and readbehind pages. */ shift = rbehind != NULL ? *rbehind : 0; if (shift != 0) { for (i = 1; i <= shift; i++) { p = vm_page_alloc(object, m[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift allocated pages to the left. */ for (j = 0; j < i - 1; j++) bp->b_pages[j] = bp->b_pages[j + shift - i + 1]; break; } bp->b_pages[shift - i] = p; } shift = i - 1; *rbehind = shift; } for (i = 0; i < reqcount; i++) bp->b_pages[i + shift] = m[i]; if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, m[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[shift + reqcount + i] = p; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); for (i = 0; i < count; i++) bp->b_pages[i]->oflags |= VPO_SWAPINPROG; pindex = bp->b_pages[0]->pindex; blk = swp_pager_meta_ctl(object, pindex, 0); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; - PCPU_INC(cnt.v_swapin); - PCPU_ADD(cnt.v_swappgsin, count); + VM_CNT_INC(v_swapin); + VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); while ((m[0]->oflags & VPO_SWAPINPROG) != 0) { m[0]->oflags |= VPO_SWAPSLEEP; - PCPU_INC(cnt.v_intrans); + VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (m[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, m, count, rbehind, rahead); VM_OBJECT_WUNLOCK(object); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, m, count, error); VM_OBJECT_WLOCK(object); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int i, n; boolean_t sync; if (count && m[0]->object != object) { panic("swap_pager_putpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_WUNLOCK(object); n = 0; if (curproc != pageproc) sync = TRUE; else sync = (flags & VM_PAGER_PUT_SYNC) != 0; /* * Step 2 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; - PCPU_INC(cnt.v_swapout); - PCPU_ADD(cnt.v_swappgsout, bp->b_npages); + VM_CNT_INC(v_swapout); + VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->paging_in_progress); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); m->valid = VM_PAGE_BITS_ALL; if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_lock(m); vm_page_deactivate_noreuse(m); vm_page_unlock(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not sleep. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; } mtx_unlock(&swhash_mtx); return (0); } int swap_pager_nswapdev(void) { return (nswapdev); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * * This routine dissociates the page at the given index within an object * from its backing store, paging it in if it does not reside in memory. * If the page is paged in, it is marked dirty and placed in the laundry * queue. The page is marked dirty because it no longer has backing * store. It is placed in the laundry queue because it has not been * accessed recently. Otherwise, it would already reside in memory. * * We also attempt to swap in all other pages in the swap block. * However, we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. */ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; vm_object_t locked_obj, object; vm_pindex_t pindex; int i, j, retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; locked_obj = NULL; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { object = swap->swb_object; pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (!swp_pager_isondev(swap->swb_pages[j], sp)) continue; if (locked_obj != object) { if (locked_obj != NULL) VM_OBJECT_WUNLOCK(locked_obj); locked_obj = object; if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(&swhash_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } MPASS(locked_obj == object); mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); mtx_lock(&swhash_mtx); goto restart; } } } mtx_unlock(&swhash_mtx); if (locked_obj != NULL) { VM_OBJECT_WUNLOCK(locked_obj); locked_obj = NULL; } if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. */ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int exhausted; struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; KASSERT(object->handle == NULL, ("default pager with handle")); } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. */ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swap_zone)) { if (atomic_cmpset_int(&exhausted, 0, 1)) printf("swap zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonex", 10); } else VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } if (atomic_cmpset_int(&exhausted, 1, 0)) printf("swap zone ok\n"); swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count) { struct swblock **pswap, *swap; vm_pindex_t c; daddr_t v; int n, sidx; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP || count == 0) return; mtx_lock(&swhash_mtx); for (c = 0; c < count;) { pswap = swp_pager_hash(object, index); sidx = index & SWAP_META_MASK; n = SWAP_META_PAGES - sidx; index += n; if ((swap = *pswap) == NULL) { c += n; continue; } for (; c < count && sidx < SWAP_META_PAGES; ++c, ++sidx) { if ((v = swap->swb_pages[sidx]) == SWAPBLK_NONE) continue; swp_pager_freeswapspace(v, 1); swap->swb_pages[sidx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; c += SWAP_META_PAGES - sidx; break; } } } mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblock **pswap, *swap; vm_pindex_t index; daddr_t v; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; index = 0; while (object->un_pager.swp.swp_bcount != 0) { mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic( "swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblock **pswap, *swap; vm_pindex_t i, j, lim; int idx; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP || object->un_pager.swp.swp_bcount == 0) return (object->size); mtx_lock(&swhash_mtx); for (j = pindex; j < object->size; j = lim) { pswap = swp_pager_hash(object, j); lim = rounddown2(j + SWAP_META_PAGES, SWAP_META_PAGES); if (lim > object->size) lim = object->size; if ((swap = *pswap) != NULL) { for (idx = j & SWAP_META_MASK, i = j; i < lim; i++, idx++) { if (swap->swb_pages[idx] != SWAPBLK_NONE) goto found; } } } i = object->size; found: mtx_unlock(&swhash_mtx); return (i); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message and return -1; otherwise, return 0. */ static int swapon_check_swzone(unsigned long npages) { unsigned long maxpages; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); return (-1); } return (0); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; swapon_check_swzone(swap_total / PAGE_SIZE); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks, dvbase; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap usage in pages for a * vmspace. * * The map must be locked. * * Swap usage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; long count, n; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_WUNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&pbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &pbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&pbuf_mtx); return (0); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 317060) +++ head/sys/vm/vm_fault.c (revision 317061) @@ -1,1719 +1,1719 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward); static inline void release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); vm_page_lock(fs->m); vm_page_deactivate(fs->m); vm_page_unlock(fs->m); fs->m = NULL; } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void unlock_and_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); VM_OBJECT_WUNLOCK(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); vm_page_lock(fs->first_m); vm_page_free(fs->first_m); vm_page_unlock(fs->first_m); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot, vm_prot_t fault_type, int fault_flags, bool set_wd) { bool need_dirty; if (((prot & VM_PROT_WRITE) == 0 && (fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_OBJECT_ASSERT_LOCKED(m->object); need_dirty = ((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_WIRE) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0; if (set_wd) vm_object_set_writeable_dirty(m->object); else /* * If two callers of vm_fault_dirty() with set_wd == * FALSE, one for the map entry with MAP_ENTRY_NOSYNC * flag set, other with flag clear, race, it is * possible for the no-NOSYNC thread to see m->dirty * != 0 and not clear VPO_NOSYNC. Take vm_page lock * around manipulation of VPO_NOSYNC and * vm_page_dirty() call, to avoid the race and keep * m->oflags consistent. */ vm_page_lock(m); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) { if (m->dirty == 0) { m->oflags |= VPO_NOSYNC; } } else { m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (need_dirty) vm_page_dirty(m); if (!set_wd) vm_page_unlock(m); if (need_dirty) vm_pager_page_unswapped(m); } static void vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m) { if (m_hold != NULL) { *m_hold = m; vm_page_lock(m); vm_page_hold(m); vm_page_unlock(m); } } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m; int rv; MPASS(fs->vp == NULL); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) return (KERN_FAILURE); rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0); if (rv != KERN_SUCCESS) return (rv); vm_fault_fill_hold(m_hold, m); vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false); VM_OBJECT_RUNLOCK(fs->first_object); if (!wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(fs->first_object->paging_in_progress > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(m->valid == VM_PAGE_BITS_ALL); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(fs->first_object->paging_in_progress > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, true); VM_OBJECT_WUNLOCK(fs->first_object); pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset, m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); VM_OBJECT_WLOCK(fs->first_object); if (pidx == fs->first_pindex) vm_fault_fill_hold(m_hold, m); vm_page_lock(m); if ((fault_flags & VM_FAULT_WIRE) != 0) { KASSERT(wired, ("VM_FAULT_WIRE && !wired")); vm_page_wire(m); } else { vm_page_activate(m); } vm_page_unlock(m); vm_page_xunbusy(m); } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } /* * vm_fault: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { struct thread *td; int result; td = curthread; if ((td->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags, NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND)) ktrfaultend(result); #endif return (result); } int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; struct vnode *vp; vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; vm_pindex_t retry_pindex; vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; int locked, nera, result, rv; u_char behavior; boolean_t wired; /* Passed by reference. */ bool dead, growstack, hardfault, is_first_object_locked; - PCPU_INC(cnt.v_vm_faults); + VM_CNT_INC(v_vm_faults); fs.vp = NULL; faultcount = 0; nera = -1; growstack = true; hardfault = false; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = false; goto RetryFault; } unlock_vp(&fs); return (result); } fs.map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("%s: fault on nofault entry, addr: %#lx", __func__, (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(&fs); fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); else KASSERT((fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are both (1) mapping an existing page from the top- * level object and (2) not having to mark that object as containing * dirty pages. Under these conditions, a read lock on the top-level * object suffices, allowing multiple page faults of a similar type to * run in parallel on the same top-level object. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 && /* avoid calling vm_object_set_writeable_dirty() */ ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) { VM_OBJECT_RLOCK(fs.first_object); if ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) { rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); if (rv == KERN_SUCCESS) return (rv); } if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = true; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * Wait/Retry if the page is busy. We have to do this * if the page is either exclusive or shared busy * because the vm_pager may be using read busy for * pageouts (and even pageins if it is the vnode * pager), and we could end up trying to pagein and * pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a shared busied page except, perhaps, * to pmap it. */ if (vm_page_busied(fs.m)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); - PCPU_INC(cnt.v_intrans); + VM_CNT_INC(v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_lock(fs.m); vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_xbusy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m)); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } if (fs.object == fs.first_object && (fs.first_object->flags & OBJ_POPULATE) != 0 && fs.first_object->shadow_count == 0) { rv = vm_fault_populate(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: unlock_and_deallocate(&fs); return (rv); case KERN_RESOURCE_SHORTAGE: unlock_and_deallocate(&fs); goto RetryFault; case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs.object, atop(vaddr) - fs.pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } } readrest: /* * At this point, we have either allocated a new page or found * an existing page that is only partially valid. * * We hold a reference on the current object and the page is * exclusive busied. */ /* * If the pager for the current object might have the page, * then determine the number of additional pages to read and * potentially reprioritize previously read pages for earlier * reclamation. These operations should only be performed * once per page fault. Even if the current pager doesn't * have the page, the number of additional pages to read will * apply to subsequent objects in the shadow chain. */ if (fs.object->type != OBJT_DEFAULT && nera == -1 && !P_KILLED(curproc)) { KASSERT(fs.lookup_still_valid, ("map unlocked")); era = fs.entry->read_ahead; behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (vaddr == fs.entry->next_read) vm_fault_dontneed(&fs, vaddr, nera); } else if (vaddr == fs.entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(&fs, vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs.entry->read_ahead = nera; } /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs.entry->start; e_end = fs.entry->end; } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ if (fs.object->type != OBJT_DEFAULT) { /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE && (vp = fs.object->handle) != fs.vp) { /* * Perform an unlock in case the desired vnode * changed while the map was unlocked during a * retry. */ unlock_vp(&fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock * while we have the page exclusive busied or * the object's paging-in-progress count * incremented. Otherwise, we could deadlock. */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs.pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - vaddr) - 1); } rv = vm_pager_get_pages(fs.object, &fs.m, 1, &behind, &ahead); if (rv == VM_PAGER_OK) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND */ } if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { vm_page_lock(fs.m); if (fs.m->wire_count == 0) vm_page_free(fs.m); else vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return (rv == VM_PAGER_ERROR ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs.object != fs.first_object) { vm_page_lock(fs.m); if (fs.m->wire_count == 0) vm_page_free(fs.m); else vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { - PCPU_INC(cnt.v_ozfod); + VM_CNT_INC(v_ozfod); } - PCPU_INC(cnt.v_zfod); + VM_CNT_INC(v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } vm_page_assert_xbusied(fs.m); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { vm_page_lock(fs.m); vm_page_remove(fs.m); vm_page_unlock(fs.m); vm_page_lock(fs.first_m); vm_page_replace_checked(fs.m, fs.first_object, fs.first_pindex, fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_page_dirty(fs.m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs.m, fs.first_object, fs.object, OFF_TO_IDX( fs.first_object->backing_object_offset)); #endif /* * Removing the page from the backing object * unbusied it. */ vm_page_xbusy(fs.m); fs.first_m = fs.m; fs.m = NULL; - PCPU_INC(cnt.v_cow_optim); + VM_CNT_INC(v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_WIRE) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, PQ_INACTIVE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); - PCPU_INC(cnt.v_cow_faults); + VM_CNT_INC(v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = true; if (fs.map->timestamp != fs.map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true); vm_page_assert_xbusied(fs.m); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 && wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fault_flags & VM_FAULT_WIRE) != 0) { KASSERT(wired, ("VM_FAULT_WIRE && !wired")); vm_page_wire(fs.m); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_xunbusy(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { - PCPU_INC(cnt.v_io_faults); + VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); first_object = fs->first_object; if (first_object != object) { if (!VM_OBJECT_TRYWLOCK(first_object)) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WLOCK(first_object); VM_OBJECT_WLOCK(object); } } /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (m->valid != VM_PAGE_BITS_ALL || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. */ vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); } } } if (first_object != object) VM_OBJECT_WUNLOCK(first_object); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { VM_OBJECT_RUNLOCK(lobject); break; } if (m->valid == VM_PAGE_BITS_ALL && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. */ for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) { vm_page_lock(*mp); vm_page_unhold(*mp); vm_page_unlock(*mp); } return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, atop(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if (dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; } else { dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; vm_page_xbusy(dst_m); KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, ("invalid dst page %p", dst_m)); } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_lock(src_m); vm_page_unwire(src_m, PQ_INACTIVE); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { KASSERT(dst_m->wire_count > 0, ("dst_m %p is not wired", dst_m)); } } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 317060) +++ head/sys/vm/vm_meter.c (revision 317061) @@ -1,322 +1,336 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -struct vmmeter vm_cnt; +struct vmmeter vm_cnt = { + .v_swtch = EARLY_COUNTER, + .v_trap = EARLY_COUNTER, + .v_syscall = EARLY_COUNTER, + .v_intr = EARLY_COUNTER, + .v_soft = EARLY_COUNTER, + .v_vm_faults = EARLY_COUNTER, + .v_io_faults = EARLY_COUNTER, + .v_cow_faults = EARLY_COUNTER, + .v_cow_optim = EARLY_COUNTER, + .v_zfod = EARLY_COUNTER, + .v_ozfod = EARLY_COUNTER, + .v_swapin = EARLY_COUNTER, + .v_swapout = EARLY_COUNTER, + .v_swappgsin = EARLY_COUNTER, + .v_swappgsout = EARLY_COUNTER, + .v_vnodein = EARLY_COUNTER, + .v_vnodeout = EARLY_COUNTER, + .v_vnodepgsin = EARLY_COUNTER, + .v_vnodepgsout = EARLY_COUNTER, + .v_intrans = EARLY_COUNTER, + .v_reactivated = EARLY_COUNTER, + .v_pdwakeups = EARLY_COUNTER, + .v_pdpages = EARLY_COUNTER, + .v_pdshortfalls = EARLY_COUNTER, + .v_dfree = EARLY_COUNTER, + .v_pfree = EARLY_COUNTER, + .v_tfree = EARLY_COUNTER, + .v_forks = EARLY_COUNTER, + .v_vforks = EARLY_COUNTER, + .v_rforks = EARLY_COUNTER, + .v_kthreads = EARLY_COUNTER, + .v_forkpages = EARLY_COUNTER, + .v_vforkpages = EARLY_COUNTER, + .v_rforkpages = EARLY_COUNTER, + .v_kthreadpages = EARLY_COUNTER, +}; +static void +vmcounter_startup(void) +{ + counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; + + COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK); +} +SYSINIT(counter, SI_SUB_CPU, SI_ORDER_FOURTH + 1, vmcounter_startup, NULL); + SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { la[0] = averunnable.ldavg[0]; la[1] = averunnable.ldavg[1]; la[2] = averunnable.ldavg[2]; la[3] = averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. */ static bool is_object_active(vm_object_t obj) { return (obj->ref_count > obj->shadow_count); } static int vmtotal(SYSCTL_HANDLER_ARGS) { struct vmtotal total; vm_object_t object; struct proc *p; struct thread *td; bzero(&total, sizeof(total)); /* * Calculate process statistics. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if ((p->p_flag & P_SYSTEM) != 0) continue; PROC_LOCK(p); if (p->p_state != PRS_NEW) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: if (TD_IS_SWAPPED(td)) total.t_sw++; else if (TD_IS_SLEEPING(td)) { if (td->td_priority <= PZERO) total.t_dw++; else total.t_sl++; if (td->td_wchan == &vm_cnt.v_free_count) total.t_pw++; } break; case TDS_CAN_RUN: total.t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; break; default: break; } thread_unlock(td); } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object. In * this case, the lack of synchronization should not * impair the accuracy of the reported statistics. */ if ((object->flags & OBJ_FICTITIOUS) != 0) { /* * Devices, like /dev/mem, will badly skew our totals. */ continue; } if (object->ref_count == 0) { /* * Also skip unreferenced objects, including * vnodes representing mounted file systems. */ continue; } if (object->ref_count == 1 && (object->flags & OBJ_NOSPLIT) != 0) { /* * Also skip otherwise unreferenced swap * objects backing tmpfs vnodes, and POSIX or * SysV shared memory. */ continue; } total.t_vm += object->size; total.t_rm += object->resident_page_count; if (is_object_active(object)) { total.t_avm += object->size; total.t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ total.t_vmshr += object->size; total.t_rmshr += object->resident_page_count; if (is_object_active(object)) { total.t_avmshr += object->size; total.t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); total.t_free = vm_cnt.v_free_count; return (sysctl_handle_opaque(oidp, &total, sizeof(total), req)); } -/* - * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt - * structure. - * - * The vmmeter structure is now per-cpu as well as global. Those - * statistics which can be kept on a per-cpu basis (to avoid cache - * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining - * statistics, such as v_free_reserved, are left in the global - * structure. - */ -u_int -vm_meter_cnt(size_t offset) -{ - struct pcpu *pcpu; - u_int count; - int i; - - count = *(u_int *)((char *)&vm_cnt + offset); - CPU_FOREACH(i) { - pcpu = pcpu_find(i); - count += *(u_int *)((char *)&pcpu->pc_cnt + offset); - } - return (count); -} - -static int -cnt_sysctl(SYSCTL_HANDLER_ARGS) -{ - u_int count; - - count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt); - return (SYSCTL_OUT(req, &count, sizeof(count))); -} - SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); #define VM_STATS(parent, var, descr) \ - SYSCTL_PROC(parent, OID_AUTO, var, \ - CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \ - cnt_sysctl, "IU", descr) + SYSCTL_COUNTER_U64(parent, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) VM_STATS_SYS(v_swtch, "Context switches"); VM_STATS_SYS(v_trap, "Traps"); VM_STATS_SYS(v_syscall, "System calls"); VM_STATS_SYS(v_intr, "Device interrupts"); VM_STATS_SYS(v_soft, "Software interrupts"); VM_STATS_VM(v_vm_faults, "Address memory faults"); VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); VM_STATS_VM(v_cow_optim, "Optimized COW faults"); VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); VM_STATS_VM(v_swapin, "Swap pager pageins"); VM_STATS_VM(v_swapout, "Swap pager pageouts"); VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); VM_STATS_VM(v_vnodein, "Vnode pager pageins"); VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon"); VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); VM_STATS_VM(v_tfree, "Total pages freed"); -VM_STATS_VM(v_page_size, "Page size in bytes"); -VM_STATS_VM(v_page_count, "Total number of pages in system"); -VM_STATS_VM(v_free_reserved, "Pages reserved for deadlock"); -VM_STATS_VM(v_free_target, "Pages desired free"); -VM_STATS_VM(v_free_min, "Minimum low-free-pages threshold"); -VM_STATS_VM(v_free_count, "Free pages"); -VM_STATS_VM(v_wire_count, "Wired pages"); -VM_STATS_VM(v_active_count, "Active pages"); -VM_STATS_VM(v_inactive_target, "Desired inactive pages"); -VM_STATS_VM(v_inactive_count, "Inactive pages"); -VM_STATS_VM(v_laundry_count, "Pages eligible for laundering"); -VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel"); -VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_VM(v_forks, "Number of fork() calls"); VM_STATS_VM(v_vforks, "Number of vfork() calls"); VM_STATS_VM(v_rforks, "Number of rfork() calls"); VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); + +#define VM_STATS_UINT(var, descr) \ + SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) +VM_STATS_UINT(v_page_size, "Page size in bytes"); +VM_STATS_UINT(v_page_count, "Total number of pages in system"); +VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); +VM_STATS_UINT(v_free_target, "Pages desired free"); +VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); +VM_STATS_UINT(v_free_count, "Free pages"); +VM_STATS_UINT(v_wire_count, "Wired pages"); +VM_STATS_UINT(v_active_count, "Active pages"); +VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); +VM_STATS_UINT(v_inactive_count, "Inactive pages"); +VM_STATS_UINT(v_laundry_count, "Pages eligible for laundering"); +VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); +VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); +VM_STATS_UINT(v_free_severe, "Severe page depletion point"); #ifdef COMPAT_FREEBSD11 /* * Provide compatibility sysctls for the benefit of old utilities which exit * with an error if they cannot be found. */ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD, SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); #endif Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 317060) +++ head/sys/vm/vm_object.c (revision 317061) @@ -1,2644 +1,2644 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); - PCPU_INC(cnt.v_pfree); + VM_CNT_INC(v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) && (object->flags & OBJ_ONEMAPPING) != 0); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE && object->type == OBJT_SWAP) swap_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. */ if (tm->valid != VM_PAGE_BITS_ALL) goto next_pindex; vm_page_lock(tm); if (tm->hold_count != 0 || tm->wire_count != 0) { vm_page_unlock(tm); goto next_pindex; } KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_busied(tm)) { if (object != tobject) VM_OBJECT_WUNLOCK(tobject); VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_advise(tm, advice); vm_page_unlock(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt", false); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol", false); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); object_collapses++; } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax", true); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar", false); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); break; } } /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_TMPFS_NODE) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (vm_page_active(m)) kvo.kvo_active++; else if (vm_page_inactive(m)) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 317060) +++ head/sys/vm/vm_page.c (revision 317061) @@ -1,3624 +1,3624 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); /* Is the page daemon waiting for free pages? */ static int vm_pageout_pages_needed; static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_wakeup(void); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; mtx_lock(&vm_page_queue_free_mtx); ret = vm_phys_unfree_page(m); mtx_unlock(&vm_page_queue_free_mtx); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = &vm_cnt.v_laundry_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; /* Unswappable dirty pages are counted as being in the laundry. */ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) = &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_paddr_t high_avail, low_avail, page_range, size; vm_paddr_t new_end; int i; vm_paddr_t pa; vm_paddr_t last_pa; char *list, *listend; vm_paddr_t end; vm_paddr_t biggestsize; int biggestone; int pages_per_zone; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(&vm_dom[i]); /* * Almost all of the pages needed for bootstrapping UMA are used * for zone structures, so if the number of CPUs results in those * structures taking more than one page each, we set aside more pages * in proportion to the zone structure size. */ pages_per_zone = howmany(sizeof(struct uma_zone) + sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE); if (pages_per_zone > 1) { /* Reserve more pages so that we don't run out. */ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; } /* * Allocate memory for use when boot strapping the kernel memory * allocator. * * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically * included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); #elif defined(VM_PHYSSEG_DENSE) /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end == high_avail) page_range = (high_avail - low_avail) / (PAGE_SIZE + sizeof(struct vm_page)); else page_range = high_avail / PAGE_SIZE - first_page; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ if (high_avail == end) high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); for (i = 0; i < page_range; i++) vm_page_array[i].order = VM_NFREEORDER; vm_page_array_size = page_range; /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Add every available physical page that is not blacklisted to * the free lists. */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { vm_phys_add_page(pa); pa += PAGE_SIZE; } } TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx, *new_mtx; mtx = NULL; for (; count != 0; count--) { /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(*ma); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { vm_page_t m, mpred; int flags, req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc: pindex already allocated")); } /* * Allocate a page if the number of free pages exceeds the minimum * for the request class. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) { /* * Can we allocate the page from a reservation? */ #if VM_NRESERVLEVEL > 0 if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = vm_reserv_alloc_page(object, pindex, mpred)) == NULL) #endif { /* * If not, allocate it from the free page queues. */ m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 if (m == NULL && vm_reserv_reclaim_inactive()) { m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } else { /* * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; int req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count >= npages)) { /* * Can we allocate the pages from a reservation? */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, low, high, alignment, boundary, mpred)) == NULL) #endif /* * If not, allocate them from the free page queues. */ m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) vm_phys_freecnt_adj(m_ret, -npages); else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) goto retry; #endif } mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); for (m = m_ret; m < &m_ret[npages]; m++) vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int( &vm_cnt.v_wire_count, npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, ("page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("page %p is wired", m)); KASSERT(m->hold_count == 0, ("page %p is held", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_freelist(int flind, int req) { vm_page_t m; u_int flags; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count > 0)) m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx, *new_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } m_inc = 1; retry: if (m->wire_count != 0 || m->hold_count != 0) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && m->queue != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct mtx *m_mtx, *new_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } retry: if (m->wire_count != 0 || m->hold_count != 0) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(m_new->wire_count == 0, ("page %p is wired", m)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); m->valid = 0; vm_page_undirty(m); /* * The new page must be deactivated * before the object is unlocked. */ new_mtx = vm_page_lockptr(m_new); if (m_mtx != new_mtx) { mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } SLIST_INSERT_HEAD(&free, m, plinks.s.ss); } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { mtx_lock(&vm_page_queue_free_mtx); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif mtx_unlock(&vm_page_queue_free_mtx); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { mtx_lock(&vm_page_queue_free_mtx); do { SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (true) #endif vm_phys_free_pages(m, 0); } while ((m = SLIST_FIRST(&free)) != NULL); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform VM_WAIT before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ count = vm_cnt.v_free_count; if (count < npages + vm_cnt.v_free_reserved || (count < npages + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (__predict_false(pageproc == NULL)) panic("vm_wait in early boot"); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { if (vm_page_in_laundry(m)) return (&vm_dom[0].vmd_pagequeues[m->queue]); else return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE) pq = &vm_dom[0].vmd_pagequeues[queue]; else pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq(). This routine is called * when a page is added to the free queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } } /* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); - PCPU_INC(cnt.v_tfree); + VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the physical memory allocator's free * page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive * queue. However, setting "noreuse" to TRUE will accelerate the specified * page's reclamation, but it will not unmap the page from any address space. * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* * vm_page_launder * * Put a page in the laundry. */ void vm_page_launder(vm_page_t m) { int queue; vm_page_assert_locked(m); if ((queue = m->queue) != PQ_LAUNDRY) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_LAUNDRY, m); } else KASSERT(queue == PQ_NONE, ("wired page %p is queued", m)); } } /* * vm_page_unswappable * * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); if (m->queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_UNSWAPPABLE, m); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_advise * * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * vm_page_ps_is_valid: * * Returns TRUE if the entire (super)page is valid and FALSE otherwise. */ boolean_t vm_page_ps_is_valid(vm_page_t m) { int i, npages; VM_OBJECT_ASSERT_LOCKED(m->object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { if (m[i].valid != VM_PAGE_BITS_ALL) return (FALSE); } return (TRUE); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_cnt.v_free_count); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 317060) +++ head/sys/vm/vm_pageout.c (revision 317061) @@ -1,2321 +1,2321 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif /* Pagedaemon activity rates, in subdivisions of one second. */ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 2 int vm_pageout_deficit; /* Estimated number of pages deficit */ u_int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ bool vm_pages_needed; /* Are threads waiting for free pages? */ /* Pending request for dirty page laundering. */ static enum { VM_LAUNDRY_IDLE, VM_LAUNDRY_BACKGROUND, VM_LAUNDRY_SHORTFALL } vm_laundry_request = VM_LAUNDRY_IDLE; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; static int swapdev_enabled; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; static int vm_swap_idle_enabled = 0; #else static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; #endif static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, "Low memory callback period"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW, &act_scan_laundry_weight, 0, "weight given to clean vs. dirty pages in active queue scans"); static u_int vm_background_launder_target; SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW, &vm_background_launder_target, 0, "background laundering target, in pages"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it write busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queue * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* * The page's object might have changed, and/or the page might * have moved from its original position in the queue. If the * page's object has changed, then the caller should abandon * processing the page because the wrong object lock was * acquired. Use the marker's plinks.q, not the page's, to * determine if the page has been moved. The state of the * page's plinks.q can be indeterminate; whereas, the marker's * plinks.q must be valid. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m->object == object && m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; /* * We can't clean the page if it is busy or held. */ vm_page_assert_unbusied(m); KASSERT(m->hold_count == 0, ("page %p is held", m)); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } vm_page_lock(p); if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; vm_page_lock(p); if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * The page is outside the object's range. We pretend * that the page out worked and clean the page, so the * changes will be lost if the page is reclaimed by * the page daemon. */ vm_page_undirty(mt); vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If the page couldn't be paged out to swap because the * pager wasn't able to find space, place the page in * the PQ_UNSWAPPABLE holding queue. This is an * optimization that prevents the page daemon from * wasting CPU cycles on pages that cannot be reclaimed * becase no swap device is configured. * * Otherwise, reactivate the page so that it doesn't * clog the laundry and inactive queues. (We will try * paging it out again later.) */ vm_page_lock(mt); if (object->type == OBJT_SWAP && pageout_status[i] == VM_PAGER_FAIL) { vm_page_unswappable(mt); numpagedout++; } else vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static void vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) { atomic_store_rel_int(&swapdev_enabled, 1); } static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) { if (swap_pager_nswapdev() == 1) atomic_store_rel_int(&swapdev_enabled, 0); } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (vm_page_busied(p)) continue; - PCPU_INC(cnt.v_pdpages); + VM_CNT_INC(v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } if (!vm_page_active(p) && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; } else if (vm_page_active(p)) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } } else if (vm_page_inactive(p)) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or held while the object * and page locks were released. */ if (vm_page_busied(m) || m->hold_count != 0) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * Attempt to launder the specified number of pages. * * Returns the number of pages successfully laundered. */ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct vm_pagequeue *pq; vm_object_t object; vm_page_t m, next; int act_delta, error, maxscan, numpagedout, starting_target; int vnodes_skipped; bool pageout_ok, queue_locked; starting_target = launder; vnodes_skipped = 0; /* * Scan the laundry queues for pages eligible to be laundered. We stop * once the target number of dirty pages have been laundered, or once * we've reached the end of the queue. A single iteration of this loop * may cause more than one page to be laundered because of clustering. * * maxscan ensures that we don't re-examine requeued pages. Any * additional pages written as part of a cluster are subtracted from * maxscan since they must be taken from the laundry queue. * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]; else pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; scan: vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; queue_locked = true; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && launder > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked laundry queue")); KASSERT(vm_page_in_laundry(m), ("page %p has an inconsistent queue", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { vm_page_unlock(m); continue; } object = m->object; if ((!VM_OBJECT_TRYWLOCK(object) && (!vm_pageout_fallback_object_lock(m, &next) || m->hold_count != 0)) || vm_page_busied(m)) { VM_OBJECT_WUNLOCK(object); vm_page_unlock(m); continue; } /* * Unlock the laundry queue, invalidating the 'next' pointer. * Use a marker to remember our place in the laundry queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, plinks.q); vm_pagequeue_unlock(pq); queue_locked = false; /* * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) act_delta += pmap_ts_referenced(m); else { KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { - PCPU_INC(cnt.v_reactivated); + VM_CNT_INC(v_reactivated); vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the laundry queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; /* * If this was a background laundering, count * activated pages towards our target. The * purpose of background laundering is to ensure * that pages are eventually cycled through the * laundry queue, and an activation is a valid * way out. */ if (!in_shortfall) launder--; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) goto requeue_page; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } /* * Clean pages are freed, and dirty pages are paged out unless * they belong to a dead object. Requeueing dirty pages from * dead objects is pointless, as they are being paged out and * freed by the thread that destroyed the object. */ if (m->dirty == 0) { free_page: vm_page_free(m); - PCPU_INC(cnt.v_dfree); + VM_CNT_INC(v_dfree); } else if ((object->flags & OBJ_DEAD) == 0) { if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = true; else if (disable_swap_pageouts) pageout_ok = false; else pageout_ok = true; if (!pageout_ok) { requeue_page: vm_pagequeue_lock(pq); queue_locked = true; vm_page_requeue_locked(m); goto drop_page; } /* * Form a cluster with adjacent, dirty pages from the * same object, and page out that entire cluster. * * The adjacent, dirty pages must also be in the * laundry. However, their mappings are not checked * for new references. Consequently, a recently * referenced page may be paged out. However, that * page will not be prematurely reclaimed. After page * out, the page will be placed in the inactive queue, * where any new references will be detected and the * page reactivated. */ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; maxscan -= numpagedout - 1; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } goto relock_queue; } drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = true; } next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); } vm_pagequeue_unlock(pq); if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; goto scan; } /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't launder enough pages. */ if (vnodes_skipped > 0 && launder > 0) (void)speedup_syncer(); return (starting_target - launder); } /* * Compute the integer square root. */ static u_int isqrt(u_int num) { u_int bit, root, tmp; bit = 1u << ((NBBY * sizeof(u_int)) - 2); while (bit > num) bit >>= 2; root = 0; while (bit != 0) { tmp = root + bit; root >>= 1; if (num >= tmp) { num -= tmp; root += bit; } bit >>= 2; } return (root); } /* * Perform the work of the laundry thread: periodically wake up and determine * whether any pages need to be laundered. If so, determine the number of pages * that need to be laundered, and launder them. */ static void vm_pageout_laundry_worker(void *arg) { struct vm_domain *domain; struct vm_pagequeue *pq; uint64_t nclean, ndirty; u_int last_launder, wakeups; int domidx, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(domain->vmd_segs != 0, ("domain without segments")); vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); shortfall = 0; in_shortfall = false; shortfall_cycle = 0; target = 0; last_launder = 0; /* * Calls to these handlers are serialized by the swap syscall lock. */ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain, EVENTHANDLER_PRI_ANY); (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain, EVENTHANDLER_PRI_ANY); /* * The pageout laundry worker is never done, so loop forever. */ for (;;) { KASSERT(target >= 0, ("negative target %d", target)); KASSERT(shortfall_cycle >= 0, ("negative cycle %d", shortfall_cycle)); launder = 0; - wakeups = VM_METER_PCPU_CNT(v_pdwakeups); + wakeups = VM_CNT_FETCH(v_pdwakeups); /* * First determine whether we need to launder pages to meet a * shortage of free pages. */ if (shortfall > 0) { in_shortfall = true; shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; target = shortfall; } else if (!in_shortfall) goto trybackground; else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run * (and we are no longer in shortfall) or we have met * our laundry target through other activity, then we * can stop laundering pages. */ in_shortfall = false; target = 0; goto trybackground; } last_launder = wakeups; launder = target / shortfall_cycle--; goto dolaundry; /* * There's no immediate need to launder any pages; see if we * meet the conditions to perform background laundering: * * 1. The ratio of dirty to clean inactive pages exceeds the * background laundering threshold and the pagedaemon has * been woken up to reclaim pages since our last * laundering, or * 2. we haven't yet reached the target of the current * background laundering run. * * The background laundering threshold is not a constant. * Instead, it is a slowly growing function of the number of * page daemon wakeups since the last laundering. Thus, as the * ratio of dirty to clean inactive pages grows, the amount of * memory pressure required to trigger laundering decreases. */ trybackground: nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; ndirty = vm_cnt.v_laundry_count; if (target == 0 && wakeups != last_launder && ndirty * isqrt(wakeups - last_launder) >= nclean) { target = vm_background_launder_target; } /* * We have a non-zero background laundering target. If we've * laundered up to our maximum without observing a page daemon * wakeup, just stop. This is a safety belt that ensures we * don't launder an excessive amount if memory pressure is low * and the ratio of dirty to clean pages is large. Otherwise, * proceed at the background laundering rate. */ if (target > 0) { if (wakeups != last_launder) { last_launder = wakeups; last_target = target; } else if (last_target - target >= vm_background_launder_max * PAGE_SIZE / 1024) { target = 0; } launder = vm_background_launder_rate * PAGE_SIZE / 1024; launder /= VM_LAUNDER_RATE; if (launder > target) launder = target; } dolaundry: if (launder > 0) { /* * Because of I/O clustering, the number of laundered * pages could exceed "target" by the maximum size of * a cluster minus one. */ target -= min(vm_pageout_launder(domain, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } /* * If we're not currently laundering pages and the page daemon * hasn't posted a new request, sleep until the page daemon * kicks us. */ vm_pagequeue_lock(pq); if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) (void)mtx_sleep(&vm_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* * If the pagedaemon has indicated that it's in shortfall, start * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { shortfall = vm_laundry_target() + vm_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) vm_laundry_request = VM_LAUNDRY_IDLE; vm_pagequeue_unlock(pq); } } /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass == 0: Update active LRU/deactivate pages * pass >= 1: Free inactive pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. */ static bool vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; int page_shortage, scan_tick, scanned, starting_page_shortage; boolean_t queue_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages that we want to free. This number * can be negative if many pages are freed between the wakeup call to * the page daemon and this calculation. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queue_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); - PCPU_INC(cnt.v_pdpages); + VM_CNT_INC(v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of inactq_shortage before the * loop over the active queue below. */ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(m->hold_count == 0, ("Held page %p", m)); /* * Dequeue the inactive page and unlock the inactive page * queue, invalidating the 'next' pointer. Dequeueing the * page here avoids a later reacquisition (and release) of * the inactive page queue lock when vm_page_activate(), * vm_page_free(), or vm_page_launder() is called. Use a * marker to remember our place in the inactive queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { - PCPU_INC(cnt.v_reactivated); + VM_CNT_INC(v_reactivated); vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) { vm_pagequeue_lock(pq); queue_locked = TRUE; m->queue = PQ_INACTIVE; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); goto drop_page; } } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } /* * Clean pages can be freed, but dirty pages must be sent back * to the laundry, unless they belong to a dead object. * Requeueing dirty pages from dead objects is pointless, as * they are being paged out and freed by the thread that * destroyed the object. */ if (m->dirty == 0) { free_page: vm_page_free(m); - PCPU_INC(cnt.v_dfree); + VM_CNT_INC(v_dfree); --page_shortage; } else if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); /* * Wake up the laundry thread so that it can perform any needed * laundering. If we didn't meet our target, we're in shortfall and * need to launder more aggressively. If PQ_LAUNDRY is empty and no * swap devices are configured, the laundry thread has no work to do, so * don't bother waking it up. */ if (vm_laundry_request == VM_LAUNDRY_IDLE && starting_page_shortage > 0) { pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); if (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled)) { if (page_shortage > 0) { vm_laundry_request = VM_LAUNDRY_SHORTFALL; - PCPU_INC(cnt.v_pdshortfalls); + VM_CNT_INC(v_pdshortfalls); } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) vm_laundry_request = VM_LAUNDRY_BACKGROUND; wakeup(&vm_laundry_request); } vm_pagequeue_unlock(pq); } #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the * active queue to either the inactive or laundry queue. * * When scanning active pages, we make clean pages count more heavily * towards the page shortage than dirty pages. This is because dirty * pages must be laundered before they can be reused and thus have less * utility when attempting to quickly alleviate a shortage. However, * this weighting also causes the scan to deactivate dirty pages more * more aggressively, improving the effectiveness of clustering and * ensuring that they can eventually be reused. */ inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + vm_cnt.v_laundry_count / act_scan_laundry_weight) + vm_paging_target() + deficit + addl_page_shortage; page_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for page daemon pages is updated after checking * the page for eligibility. */ - PCPU_INC(cnt.v_pdpages); + VM_CNT_INC(v_pdpages); /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else m->act_count -= min(m->act_count, ACT_DECLINE); /* * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); /* * When not short for inactive pages, let dirty pages go * through the inactive queue before moving to the * laundry queues. This gives them some extra time to * be reactivated, potentially avoiding an expensive * pageout. During a page shortage, the inactive queue * is necessarily small, so we may move dirty pages * directly to the laundry queue. */ if (inactq_shortage <= 0) vm_page_deactivate(m); else { /* * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, * directing dirty pages into the laundry * queue is only an optimization and not a * requirement. Therefore, we simply rely on * the opportunistic updates to the page's * dirty field by the pmap. */ if (m->dirty == 0) { vm_page_deactivate(m); inactq_shortage -= act_scan_laundry_weight; } else { vm_page_launder(m); inactq_shortage--; } } } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second when we are reclaiming * pages. */ if (vm_swap_idle_enabled && pass > 0) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } #endif return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of its child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); wakeup(&vm_cnt.v_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *domain; int domidx, pass; bool target_met; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; pass = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(domain->vmd_segs != 0, ("domain without segments")); domain->vmd_last_active_scan = ticks; vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, &domain->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { mtx_lock(&vm_page_queue_free_mtx); /* * Generally, after a level >= 1 scan, if there are enough * free pages to wakeup the waiters, then they are already * awake. A call to vm_page_free() during the scan awakened * them. However, in the following case, this wakeup serves * to bound the amount of time that a thread might wait. * Suppose a thread's call to vm_page_alloc() fails, but * before that thread calls VM_WAIT, enough pages are freed by * other threads to alleviate the free page shortage. The * thread will, nonetheless, wait until another page is freed * or this wakeup is performed. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } /* * Do not clear vm_pageout_wanted until we reach our free page * target. Otherwise, we may be awakened over and over again, * wasting CPU time. */ if (vm_pageout_wanted && target_met) vm_pageout_wanted = false; /* * Might the page daemon receive a wakeup call? */ if (vm_pageout_wanted) { /* * No. Either vm_pageout_wanted was set by another * thread during the previous scan, which must have * been a level 0 scan, or vm_pageout_wanted was * already set and the scan failed to free enough * pages. If we haven't yet performed a level >= 1 * (page reclamation) scan, then increase the level * and scan again now. Otherwise, sleep a bit and * try again later. */ mtx_unlock(&vm_page_queue_free_mtx); if (pass >= 1) pause("psleep", hz / VM_INACT_SCAN_RATE); pass++; } else { /* * Yes. Sleep until pages need to be reclaimed or * have their reference stats updated. */ if (mtx_sleep(&vm_pageout_wanted, &vm_page_queue_free_mtx, PDROP | PVM, "psleep", hz) == 0) { - PCPU_INC(cnt.v_pdwakeups); + VM_CNT_INC(v_pdwakeups); pass = 1; } else pass = 0; } target_met = vm_pageout_scan(domain, pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init(void) { /* * Initialize some paging parameters. */ vm_cnt.v_interrupt_free_min = 2; if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vm_cnt.v_page_count > 1024) vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; else vm_cnt.v_free_min = 4; vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + vm_cnt.v_interrupt_free_min; vm_cnt.v_free_reserved = vm_pageout_page_count + vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; vm_cnt.v_free_min += vm_cnt.v_free_reserved; vm_cnt.v_free_severe += vm_cnt.v_free_reserved; vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; /* * Target amount of memory to move out of the laundry queue during a * background laundering. This is proportional to the amount of system * memory. */ vm_background_launder_target = (vm_cnt.v_free_target - vm_cnt.v_free_min) / 10; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; #ifdef VM_NUMA_ALLOC int i; #endif swap_pager_swap_init(); error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, 0, 0, "laundry: dom0"); if (error != 0) panic("starting laundry for domain 0, error %d", error); #ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } } #endif error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &vm_cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup(void) { if (!vm_pageout_wanted && curthread->td_proc != pageproc) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); size = vmspace_resident_count(vm); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_pageout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 317060) +++ head/sys/vm/vnode_pager.c (revision 317061) @@ -1,1422 +1,1422 @@ /*- * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; int vnode_async_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if (!(object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(object); return (0); } VOP_UNLOCK(vp, 0); vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ VM_OBJECT_WLOCK(object); object->ref_count--; VM_OBJECT_WUNLOCK(object); vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { /* * don't double-terminate the object */ if ((obj->flags & OBJ_DEAD) == 0) { vm_object_terminate(obj); } else { /* * Waiters were already handled during object * termination. The exclusive vnode lock hopefully * prevented new waiters from referencing the dying * object. */ KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0, ("OBJ_DISCONNECTWNT set obj %p flags %x", obj, obj->flags)); vp->v_object = NULL; VM_OBJECT_WUNLOCK(obj); } } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. * * MPSAFE */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; /* * If the object is being terminated, wait for it to * go away. */ retry: while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) break; vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0); } KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference")); if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->handle = handle; VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were sleeping */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; object->ref_count = 0; VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); } else { object->ref_count++; #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif VM_OBJECT_WUNLOCK(object); } vrefact(vp); return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; if (object->flags & OBJ_DISCONNECTWNT) { vm_object_clear_flag(object, OBJ_DISCONNECTWNT); wakeup(object); } ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); while (refs-- > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_WLOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_WUNLOCK(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_WLOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; /* ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */ VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VM_OBJECT_WLOCK(object); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); VM_OBJECT_WLOCK(object); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock, *freecnt; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= sizeof(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { - PCPU_INC(cnt.v_vnodein); - PCPU_INC(cnt.v_vnodepgsin); + VM_CNT_INC(v_vnodein); + VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { relpbuf(bp, freecnt); for (i = 0; i < count; i++) { - PCPU_INC(cnt.v_vnodein); - PCPU_INC(cnt.v_vnodepgsin); + VM_CNT_INC(v_vnodein); + VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); relpbuf(bp, freecnt); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. */ if (rbehind + rahead + count > nitems(bp->b_pages)) { int trim, sum; trim = rbehind + rahead + count - nitems(bp->b_pages) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= nitems(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= nitems(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1, prev = 0; j < bp->b_npages; j++) { if (bp->b_pages[j] == bogus_page) continue; KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex == j - prev, ("%s: pages array not consecutive, bp %p", __func__, bp)); prev = j; } #endif /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); - PCPU_INC(cnt.v_vnodein); - PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_async_pbuf_freecnt); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } VM_OBJECT_WLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_WUNLOCK(object); if (error != 0) printf("%s: I/O read error %d\n", __func__, error); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min) flags |= VM_PAGER_PUT_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { vm_object_t object; vm_page_t m; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int count, error, i, maxsize, ncount, ppscheck; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", (long)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(ma[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page locked we are free to fix-up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ VM_OBJECT_WLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { int pgoff; maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } } } VM_OBJECT_WUNLOCK(object); aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_td = (struct thread *) 0; error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags), curthread->td_ucred); - PCPU_INC(cnt.v_vnodeout); - PCPU_ADD(cnt.v_vnodepgsout, ncount); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); ppscheck = 0; if (error) { if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) printf("vnode_pager_putpages: residual I/O %zd at %lu\n", auio.uio_resid, (u_long)ma[0]->pindex); } for (i = 0; i < ncount; i++) { rtvals[i] = VM_PAGER_OK; } return rtvals[0]; } int vnode_pager_putpages_ioflags(int pager_flags) { int ioflags; /* * Pageouts are already clustered, use IO_ASYNC to force a * bawrite() rather then a bdwrite() to prevent paging I/O * from saturating the buffer cache. Dummy-up the sequential * heuristic to cause large ranges to cluster. If neither * IO_SYNC or IO_ASYNC is set, the system decides how to * cluster. */ ioflags = IO_VMIO; if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0) ioflags |= IO_SYNC; else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0; ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; return (ioflags); } void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) { vm_object_t obj; int i, pos; if (written == 0) return; obj = ma[0]->object; VM_OBJECT_WLOCK(obj); for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(ma[i]); } else { /* Partially written page. */ rtvals[i] = VM_PAGER_AGAIN; vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } VM_OBJECT_WUNLOCK(obj); } void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; vm_ooffset_t old_wm; VM_OBJECT_WLOCK(object); if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } old_wm = object->un_pager.vnp.writemappings; object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount inc"); VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount dec"); VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } VM_OBJECT_WUNLOCK(object); } void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclaimation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp, 0); vdrop(vp); if (mp != NULL) vn_finished_write(mp); } Index: head/sys/x86/acpica/srat.c =================================================================== --- head/sys/x86/acpica/srat.c (revision 317060) +++ head/sys/x86/acpica/srat.c (revision 317061) @@ -1,539 +1,540 @@ /*- * Copyright (c) 2010 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if MAXMEMDOM > 1 struct cpu_info { int enabled:1; int has_memory:1; int domain; } cpus[MAX_APIC_ID + 1]; struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; int num_mem; static ACPI_TABLE_SRAT *srat; static vm_paddr_t srat_physaddr; static int domain_pxm[MAXMEMDOM]; static int ndomain; static ACPI_TABLE_SLIT *slit; static vm_paddr_t slit_physaddr; static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; static void srat_walk_table(acpi_subtable_handler *handler, void *arg); /* * SLIT parsing. */ static void slit_parse_table(ACPI_TABLE_SLIT *s) { int i, j; int i_domain, j_domain; int offset = 0; uint8_t e; /* * This maps the SLIT data into the VM-domain centric view. * There may be sparse entries in the PXM namespace, so * remap them to a VM-domain ID and if it doesn't exist, * skip it. * * It should result in a packed 2d array of VM-domain * locality information entries. */ if (bootverbose) printf("SLIT.Localities: %d\n", (int) s->LocalityCount); for (i = 0; i < s->LocalityCount; i++) { i_domain = acpi_map_pxm_to_vm_domainid(i); if (i_domain < 0) continue; if (bootverbose) printf("%d: ", i); for (j = 0; j < s->LocalityCount; j++) { j_domain = acpi_map_pxm_to_vm_domainid(j); if (j_domain < 0) continue; e = s->Entry[i * s->LocalityCount + j]; if (bootverbose) printf("%d ", (int) e); /* 255 == "no locality information" */ if (e == 255) vm_locality_table[offset] = -1; else vm_locality_table[offset] = e; offset++; } if (bootverbose) printf("\n"); } } /* * Look for an ACPI System Locality Distance Information Table ("SLIT") */ static int parse_slit(void) { if (resource_disabled("slit", 0)) { return (-1); } slit_physaddr = acpi_find_table(ACPI_SIG_SLIT); if (slit_physaddr == 0) { return (-1); } /* * Make a pass over the table to populate the cpus[] and * mem_info[] tables. */ slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT); slit_parse_table(slit); acpi_unmap_table(slit); slit = NULL; #ifdef VM_NUMA_ALLOC /* Tell the VM about it! */ mem_locality = vm_locality_table; #endif return (0); } /* * SRAT parsing. */ /* * Returns true if a memory range overlaps with at least one range in * phys_avail[]. */ static int overlaps_phys_avail(vm_paddr_t start, vm_paddr_t end) { int i; for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] < start) continue; if (phys_avail[i] < end) return (1); break; } return (0); } static void srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_SRAT_CPU_AFFINITY *cpu; ACPI_SRAT_X2APIC_CPU_AFFINITY *x2apic; ACPI_SRAT_MEM_AFFINITY *mem; int domain, i, slot; switch (entry->Type) { case ACPI_SRAT_TYPE_CPU_AFFINITY: cpu = (ACPI_SRAT_CPU_AFFINITY *)entry; domain = cpu->ProximityDomainLo | cpu->ProximityDomainHi[0] << 8 | cpu->ProximityDomainHi[1] << 16 | cpu->ProximityDomainHi[2] << 24; if (bootverbose) printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", cpu->ApicId, domain, (cpu->Flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled"); if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) break; if (cpu->ApicId > MAX_APIC_ID) { printf("SRAT: Ignoring local APIC ID %u (too high)\n", cpu->ApicId); break; } if (cpus[cpu->ApicId].enabled) { printf("SRAT: Duplicate local APIC ID %u\n", cpu->ApicId); *(int *)arg = ENXIO; break; } cpus[cpu->ApicId].domain = domain; cpus[cpu->ApicId].enabled = 1; break; case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: x2apic = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)entry; if (bootverbose) printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", x2apic->ApicId, x2apic->ProximityDomain, (x2apic->Flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled"); if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED)) break; if (x2apic->ApicId > MAX_APIC_ID) { printf("SRAT: Ignoring local APIC ID %u (too high)\n", x2apic->ApicId); break; } KASSERT(!cpus[x2apic->ApicId].enabled, ("Duplicate local APIC ID %u", x2apic->ApicId)); cpus[x2apic->ApicId].domain = x2apic->ProximityDomain; cpus[x2apic->ApicId].enabled = 1; break; case ACPI_SRAT_TYPE_MEMORY_AFFINITY: mem = (ACPI_SRAT_MEM_AFFINITY *)entry; if (bootverbose) printf( "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n", mem->ProximityDomain, (uintmax_t)mem->BaseAddress, (uintmax_t)mem->Length, (mem->Flags & ACPI_SRAT_MEM_ENABLED) ? "enabled" : "disabled"); if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) break; if (!overlaps_phys_avail(mem->BaseAddress, mem->BaseAddress + mem->Length)) { printf("SRAT: Ignoring memory at addr 0x%jx\n", (uintmax_t)mem->BaseAddress); break; } if (num_mem == VM_PHYSSEG_MAX) { printf("SRAT: Too many memory regions\n"); *(int *)arg = ENXIO; break; } slot = num_mem; for (i = 0; i < num_mem; i++) { if (mem_info[i].end <= mem->BaseAddress) continue; if (mem_info[i].start < (mem->BaseAddress + mem->Length)) { printf("SRAT: Overlapping memory entries\n"); *(int *)arg = ENXIO; return; } slot = i; } for (i = num_mem; i > slot; i--) mem_info[i] = mem_info[i - 1]; mem_info[slot].start = mem->BaseAddress; mem_info[slot].end = mem->BaseAddress + mem->Length; mem_info[slot].domain = mem->ProximityDomain; num_mem++; break; } } /* * Ensure each memory domain has at least one CPU and that each CPU * has at least one memory domain. */ static int check_domains(void) { int found, i, j; for (i = 0; i < num_mem; i++) { found = 0; for (j = 0; j <= MAX_APIC_ID; j++) if (cpus[j].enabled && cpus[j].domain == mem_info[i].domain) { cpus[j].has_memory = 1; found++; } if (!found) { printf("SRAT: No CPU found for memory domain %d\n", mem_info[i].domain); return (ENXIO); } } for (i = 0; i <= MAX_APIC_ID; i++) if (cpus[i].enabled && !cpus[i].has_memory) { printf("SRAT: No memory found for CPU %d\n", i); return (ENXIO); } return (0); } /* * Check that the SRAT memory regions cover all of the regions in * phys_avail[]. */ static int check_phys_avail(void) { vm_paddr_t address; int i, j; /* j is the current offset into phys_avail[]. */ address = phys_avail[0]; j = 0; for (i = 0; i < num_mem; i++) { /* * Consume as many phys_avail[] entries as fit in this * region. */ while (address >= mem_info[i].start && address <= mem_info[i].end) { /* * If we cover the rest of this phys_avail[] entry, * advance to the next entry. */ if (phys_avail[j + 1] <= mem_info[i].end) { j += 2; if (phys_avail[j] == 0 && phys_avail[j + 1] == 0) { return (0); } address = phys_avail[j]; } else address = mem_info[i].end + 1; } } printf("SRAT: No memory region found for 0x%jx - 0x%jx\n", (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]); return (ENXIO); } /* * Renumber the memory domains to be compact and zero-based if not * already. Returns an error if there are too many domains. */ static int renumber_domains(void) { int i, j, slot; /* Enumerate all the domains. */ ndomain = 0; for (i = 0; i < num_mem; i++) { /* See if this domain is already known. */ for (j = 0; j < ndomain; j++) { if (domain_pxm[j] >= mem_info[i].domain) break; } if (j < ndomain && domain_pxm[j] == mem_info[i].domain) continue; if (ndomain >= MAXMEMDOM) { ndomain = 1; printf("SRAT: Too many memory domains\n"); return (EFBIG); } /* Insert the new domain at slot 'j'. */ slot = j; for (j = ndomain; j > slot; j--) domain_pxm[j] = domain_pxm[j - 1]; domain_pxm[slot] = mem_info[i].domain; ndomain++; } /* Renumber each domain to its index in the sorted 'domain_pxm' list. */ for (i = 0; i < ndomain; i++) { /* * If the domain is already the right value, no need * to renumber. */ if (domain_pxm[i] == i) continue; /* Walk the cpu[] and mem_info[] arrays to renumber. */ for (j = 0; j < num_mem; j++) if (mem_info[j].domain == domain_pxm[i]) mem_info[j].domain = i; for (j = 0; j <= MAX_APIC_ID; j++) if (cpus[j].enabled && cpus[j].domain == domain_pxm[i]) cpus[j].domain = i; } return (0); } /* * Look for an ACPI System Resource Affinity Table ("SRAT") */ static int parse_srat(void) { int error; if (resource_disabled("srat", 0)) return (-1); srat_physaddr = acpi_find_table(ACPI_SIG_SRAT); if (srat_physaddr == 0) return (-1); /* * Make a pass over the table to populate the cpus[] and * mem_info[] tables. */ srat = acpi_map_table(srat_physaddr, ACPI_SIG_SRAT); error = 0; srat_walk_table(srat_parse_entry, &error); acpi_unmap_table(srat); srat = NULL; if (error || check_domains() != 0 || check_phys_avail() != 0 || renumber_domains() != 0) { srat_physaddr = 0; return (-1); } #ifdef VM_NUMA_ALLOC /* Point vm_phys at our memory affinity table. */ vm_ndomains = ndomain; mem_affinity = mem_info; #endif return (0); } static void init_mem_locality(void) { int i; /* * For now, assume -1 == "no locality information for * this pairing. */ for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++) vm_locality_table[i] = -1; } static void parse_acpi_tables(void *dummy) { if (parse_srat() < 0) return; init_mem_locality(); (void) parse_slit(); } SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables, NULL); static void srat_walk_table(acpi_subtable_handler *handler, void *arg) { acpi_walk_subtables(srat + 1, (char *)srat + srat->Header.Length, handler, arg); } /* * Setup per-CPU domain IDs. */ static void srat_set_cpus(void *dummy) { struct cpu_info *cpu; struct pcpu *pc; u_int i; if (srat_physaddr == 0) return; for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; pc = pcpu_find(i); KASSERT(pc != NULL, ("no pcpu data for CPU %u", i)); cpu = &cpus[pc->pc_apic_id]; if (!cpu->enabled) panic("SRAT: CPU with APIC ID %u is not known", pc->pc_apic_id); pc->pc_domain = cpu->domain; CPU_SET(i, &cpuset_domain[cpu->domain]); if (bootverbose) printf("SRAT: CPU %u has memory domain %d\n", i, cpu->domain); } } SYSINIT(srat_set_cpus, SI_SUB_CPU, SI_ORDER_ANY, srat_set_cpus, NULL); /* * Map a _PXM value to a VM domain ID. * * Returns the domain ID, or -1 if no domain ID was found. */ int acpi_map_pxm_to_vm_domainid(int pxm) { int i; for (i = 0; i < ndomain; i++) { if (domain_pxm[i] == pxm) return (i); } return (-1); } #else /* MAXMEMDOM == 1 */ int acpi_map_pxm_to_vm_domainid(int pxm) { return (-1); } #endif /* MAXMEMDOM > 1 */ Index: head/sys/x86/x86/intr_machdep.c =================================================================== --- head/sys/x86/x86/intr_machdep.c (revision 317060) +++ head/sys/x86/x86/intr_machdep.c (revision 317061) @@ -1,601 +1,601 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Machine dependent interrupt code for x86. For x86, we have to * deal with different PICs. Thus, we use the passed in vector to lookup * an interrupt source associated with that vector. The interrupt source * describes which PIC the source belongs to and includes methods to handle * that source. */ #include "opt_atpic.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifndef DEV_ATPIC #include #include #include #include #include #endif #define MAX_STRAY_LOG 5 typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc *interrupt_sources[NUM_IO_INTS]; static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; #if defined(SMP) && !defined(EARLY_AP_STARTUP) static int assign_cpu; #endif u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); static void intrcnt_setname(const char *name, int index); static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); static int intr_pic_registered(struct pic *pic) { struct pic *p; TAILQ_FOREACH(p, &pics, pics) { if (p == pic) return (1); } return (0); } /* * Register a new interrupt controller (PIC). This is to support suspend * and resume where we suspend/resume controllers rather than individual * sources. This also allows controllers with no active sources (such as * 8259As in a system using the APICs) to participate in suspend and resume. */ int intr_register_pic(struct pic *pic) { int error; mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } mtx_unlock(&intrpic_lock); return (error); } /* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. */ int intr_register_source(struct intsrc *isrc) { int error, vector; KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source, (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:", vector); if (error) return (error); sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; sx_xunlock(&intrsrc_lock); return (0); } struct intsrc * intr_lookup_source(int vector) { return (interrupt_sources[vector]); } int intr_add_handler(const char *name, int vector, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } sx_xunlock(&intrsrc_lock); } return (error); } int intr_remove_handler(void *cookie) { struct intsrc *isrc; int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); sx_xunlock(&intrsrc_lock); } return (error); } int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); } static void intr_disable_src(void *arg) { struct intsrc *isrc; isrc = arg; isrc->is_pic->pic_disable_source(isrc, PIC_EOI); } void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame) { struct intr_event *ie; int vector; /* * We count software interrupts when we process them. The * code here follows previous practice, but there's an * argument for counting hardware interrupts when they're * processed too. */ (*isrc->is_count)++; - PCPU_INC(cnt.v_intr); + VM_CNT_INC(v_intr); ie = isrc->is_event; /* * XXX: We assume that IRQ 0 is only used for the ISA timer * device (clk). */ vector = isrc->is_pic->pic_vector(isrc); if (vector == 0) clkintr_pending = 1; /* * For stray interrupts, mask and EOI the source, bump the * stray count, and log the condition. */ if (intr_event_handle(ie, frame) != 0) { isrc->is_pic->pic_disable_source(isrc, PIC_EOI); (*isrc->is_straycount)++; if (*isrc->is_straycount < MAX_STRAY_LOG) log(LOG_ERR, "stray irq%d\n", vector); else if (*isrc->is_straycount == MAX_STRAY_LOG) log(LOG_CRIT, "too many stray irq %d's: not logging anymore\n", vector); } } void intr_resume(bool suspend_cancelled) { struct pic *pic; #ifndef DEV_ATPIC atpic_reset(); #endif mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } mtx_unlock(&intrpic_lock); } void intr_suspend(void) { struct pic *pic; mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } mtx_unlock(&intrpic_lock); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); if (cpu != NOCPU) { #else /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. */ if (assign_cpu && cpu != NOCPU) { #endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); sx_xunlock(&intrsrc_lock); } else error = 0; return (error); #else return (EOPNOTSUPP); #endif } static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void intrcnt_updatename(struct intsrc *is) { intrcnt_setname(is->is_event->ie_fullname, is->is_index); } static void intrcnt_register(struct intsrc *is) { char straystr[MAXCOMLEN + 1]; KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", is->is_pic->pic_vector(is)); intrcnt_updatename(is); is->is_count = &intrcnt[is->is_index]; intrcnt_setname(straystr, is->is_index + 1); is->is_straycount = &intrcnt[is->is_index + 1]; mtx_unlock_spin(&intrcnt_lock); } void intrcnt_add(const char *name, u_long **countp) { mtx_lock_spin(&intrcnt_lock); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; mtx_unlock_spin(&intrcnt_lock); } static void intr_init(void *dummy __unused) { intrcnt_setname("???", 0); intrcnt_index = 1; TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_final(void *dummy __unused) { /* * Enable interrupts on the BSP after all of the interrupt * controllers are initialized. Device interrupts are still * disabled in the interrupt controllers until interrupt * handlers are registered. Interrupts are enabled on each AP * after their first context switch. */ enable_intr(); } SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. */ void atpic_reset(void) { outb(IO_ICU1, ICW1_RESET | ICW1_IC4); outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS); outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID)); outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE); outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU1, OCW3_SEL | OCW3_RR); outb(IO_ICU2, ICW1_RESET | ICW1_IC4); outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8); outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID); outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE); outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU2, OCW3_SEL | OCW3_RR); } #endif /* Add a description to an active interrupt handler. */ int intr_describe(u_int vector, void *ih, const char *descr) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_describe_handler(isrc->is_event, ih, descr); if (error) return (error); intrcnt_updatename(isrc); return (0); } void intr_reprogram(void) { struct intsrc *is; int v; sx_xlock(&intrsrc_lock); for (v = 0; v < NUM_IO_INTS; v++) { is = interrupt_sources[v]; if (is == NULL) continue; if (is->is_pic->pic_reprogram_pin != NULL) is->is_pic->pic_reprogram_pin(is); } sx_xunlock(&intrsrc_lock); } #ifdef DDB /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; int i, verbose; if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; isrc = interrupt_sources; for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } #endif #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu; /* * Return the CPU that the next interrupt source should use. For now * this just returns the next local APIC according to round-robin. */ u_int intr_next_cpu(void) { u_int apic_id; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #else /* Leave all interrupts on the BSP during boot. */ if (!assign_cpu) return (PCPU_GET(apic_id)); #endif mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu]; do { current_cpu++; if (current_cpu > mp_maxid) current_cpu = 0; } while (!CPU_ISSET(current_cpu, &intr_cpus)); mtx_unlock_spin(&icu_lock); return (apic_id); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(u_int vector, u_char cpu) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (intr_event_bind(isrc->is_event, cpu)); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding local APIC %d as a target\n", cpu_apic_ids[cpu]); CPU_SET(cpu, &intr_cpus); } #ifndef EARLY_AP_STARTUP /* * Distribute all the interrupt sources among the available CPUs once the * AP's have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct intsrc *isrc; int i; /* Don't bother on UP. */ if (mp_ncpus == 1) return; /* Round-robin assign a CPU to each enabled source. */ sx_xlock(&intrsrc_lock); assign_cpu = 1; for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc != NULL && isrc->is_handlers > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. Note that * this is careful to only advance the * round-robin if the CPU assignment succeeds. */ if (isrc->is_event->ie_cpu != NOCPU) (void)isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[isrc->is_event->ie_cpu]); else if (isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[current_cpu]) == 0) (void)intr_next_cpu(); } } sx_xunlock(&intrsrc_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); #endif #else /* * Always route interrupts to the current processor in the UP case. */ u_int intr_next_cpu(void) { return (PCPU_GET(apic_id)); } #endif Index: head/usr.bin/top/machine.c =================================================================== --- head/usr.bin/top/machine.c (revision 317060) +++ head/usr.bin/top/machine.c (revision 317061) @@ -1,1680 +1,1680 @@ /* * top - a top users display for Unix * * SYNOPSIS: For FreeBSD-2.x and later * * DESCRIPTION: * Originally written for BSD4.4 system by Christos Zoulas. * Ported to FreeBSD 2.x by Steven Wallace && Wolfram Schneider * Order support hacked in from top-3.5beta6/machine/m_aix41.c * by Monte Mitzelfelt (for latest top see http://www.groupsys.com/topinfo/) * * This is the machine-dependent module for FreeBSD 2.2 * Works for: * FreeBSD 2.2.x, 3.x, 4.x, and probably FreeBSD 2.1.x * * LIBS: -lkvm * * AUTHOR: Christos Zoulas * Steven Wallace * Wolfram Schneider * Thomas Moestl * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "top.h" #include "machine.h" #include "screen.h" #include "utils.h" #include "layout.h" #define GETSYSCTL(name, var) getsysctl(name, &(var), sizeof(var)) #define SMPUNAMELEN 13 #define UPUNAMELEN 15 extern struct process_select ps; extern char* printable(char *); static int smpmode; enum displaymodes displaymode; #ifdef TOP_USERNAME_LEN static int namelength = TOP_USERNAME_LEN; #else static int namelength = 8; #endif /* TOP_JID_LEN based on max of 999999 */ #define TOP_JID_LEN 7 #define TOP_SWAP_LEN 6 static int jidlength; static int swaplength; static int cmdlengthdelta; /* Prototypes for top internals */ void quit(int); /* get_process_info passes back a handle. This is what it looks like: */ struct handle { struct kinfo_proc **next_proc; /* points to next valid proc pointer */ int remaining; /* number of pointers remaining */ }; /* declarations for load_avg */ #include "loadavg.h" /* define what weighted cpu is. */ #define weighted_cpu(pct, pp) ((pp)->ki_swtime == 0 ? 0.0 : \ ((pct) / (1.0 - exp((pp)->ki_swtime * logcpu)))) /* what we consider to be process size: */ #define PROCSIZE(pp) ((pp)->ki_size / 1024) #define RU(pp) (&(pp)->ki_rusage) #define RUTOT(pp) \ (RU(pp)->ru_inblock + RU(pp)->ru_oublock + RU(pp)->ru_majflt) #define PCTCPU(pp) (pcpu[pp - pbase]) /* definitions for indices in the nlist array */ /* * These definitions control the format of the per-process area */ static char io_header[] = " PID%*s %-*.*s VCSW IVCSW READ WRITE FAULT TOTAL PERCENT COMMAND"; #define io_Proc_format \ "%5d%*s %-*.*s %6ld %6ld %6ld %6ld %6ld %6ld %6.2f%% %.*s" static char smp_header_thr[] = " PID%*s %-*.*s THR PRI NICE SIZE RES%*s STATE C TIME %7s COMMAND"; static char smp_header[] = " PID%*s %-*.*s " "PRI NICE SIZE RES%*s STATE C TIME %7s COMMAND"; #define smp_Proc_format \ "%5d%*s %-*.*s %s%3d %4s%7s %6s%*.*s %-6.6s %2d%7s %6.2f%% %.*s" static char up_header_thr[] = " PID%*s %-*.*s THR PRI NICE SIZE RES%*s STATE TIME %7s COMMAND"; static char up_header[] = " PID%*s %-*.*s " "PRI NICE SIZE RES%*s STATE TIME %7s COMMAND"; #define up_Proc_format \ "%5d%*s %-*.*s %s%3d %4s%7s %6s%*.*s %-6.6s%.0d%7s %6.2f%% %.*s" /* process state names for the "STATE" column of the display */ /* the extra nulls in the string "run" are for adding a slash and the processor number when needed */ char *state_abbrev[] = { "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", "WAIT", "LOCK" }; static kvm_t *kd; /* values that we stash away in _init and use in later routines */ static double logcpu; /* these are retrieved from the kernel in _init */ static load_avg ccpu; /* these are used in the get_ functions */ static int lastpid; /* these are for calculating cpu state percentages */ static long cp_time[CPUSTATES]; static long cp_old[CPUSTATES]; static long cp_diff[CPUSTATES]; /* these are for detailing the process states */ int process_states[8]; char *procstatenames[] = { "", " starting, ", " running, ", " sleeping, ", " stopped, ", " zombie, ", " waiting, ", " lock, ", NULL }; /* these are for detailing the cpu states */ int cpu_states[CPUSTATES]; char *cpustatenames[] = { "user", "nice", "system", "interrupt", "idle", NULL }; /* these are for detailing the memory statistics */ int memory_stats[7]; char *memorynames[] = { "K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ", "K Free", NULL }; int arc_stats[7]; char *arcnames[] = { "K Total, ", "K MFU, ", "K MRU, ", "K Anon, ", "K Header, ", "K Other", NULL }; int carc_stats[5]; char *carcnames[] = { "K Compressed, ", "K Uncompressed, ", ":1 Ratio, ", "K Overhead", NULL }; int swap_stats[7]; char *swapnames[] = { "K Total, ", "K Used, ", "K Free, ", "% Inuse, ", "K In, ", "K Out", NULL }; /* these are for keeping track of the proc array */ static int nproc; static int onproc = -1; static int pref_len; static struct kinfo_proc *pbase; static struct kinfo_proc **pref; static struct kinfo_proc *previous_procs; static struct kinfo_proc **previous_pref; static int previous_proc_count = 0; static int previous_proc_count_max = 0; static int previous_thread; /* data used for recalculating pctcpu */ static double *pcpu; static struct timespec proc_uptime; static struct timeval proc_wall_time; static struct timeval previous_wall_time; static uint64_t previous_interval = 0; /* total number of io operations */ static long total_inblock; static long total_oublock; static long total_majflt; /* these are for getting the memory statistics */ static int arc_enabled; static int carc_enabled; static int pageshift; /* log base 2 of the pagesize */ /* define pagetok in terms of pageshift */ #define pagetok(size) ((size) << pageshift) /* swap usage */ #define ki_swap(kip) \ ((kip)->ki_swrss > (kip)->ki_rssize ? (kip)->ki_swrss - (kip)->ki_rssize : 0) /* useful externals */ long percentages(int cnt, int *out, long *new, long *old, long *diffs); #ifdef ORDER /* * Sorting orders. The first element is the default. */ char *ordernames[] = { "cpu", "size", "res", "time", "pri", "threads", "total", "read", "write", "fault", "vcsw", "ivcsw", "jid", "swap", "pid", NULL }; #endif /* Per-cpu time states */ static int maxcpu; static int maxid; static int ncpus; static u_long cpumask; static long *times; static long *pcpu_cp_time; static long *pcpu_cp_old; static long *pcpu_cp_diff; static int *pcpu_cpu_states; static int compare_swap(const void *a, const void *b); static int compare_jid(const void *a, const void *b); static int compare_pid(const void *a, const void *b); static int compare_tid(const void *a, const void *b); static const char *format_nice(const struct kinfo_proc *pp); static void getsysctl(const char *name, void *ptr, size_t len); static int swapmode(int *retavail, int *retfree); static void update_layout(void); void toggle_pcpustats(void) { if (ncpus == 1) return; update_layout(); } /* Adjust display based on ncpus and the ARC state. */ static void update_layout(void) { y_mem = 3; y_arc = 4; y_carc = 5; y_swap = 4 + arc_enabled + carc_enabled; y_idlecursor = 5 + arc_enabled + carc_enabled; y_message = 5 + arc_enabled + carc_enabled; y_header = 6 + arc_enabled + carc_enabled; y_procs = 7 + arc_enabled + carc_enabled; Header_lines = 7 + arc_enabled + carc_enabled; if (pcpu_stats) { y_mem += ncpus - 1; y_arc += ncpus - 1; y_carc += ncpus - 1; y_swap += ncpus - 1; y_idlecursor += ncpus - 1; y_message += ncpus - 1; y_header += ncpus - 1; y_procs += ncpus - 1; Header_lines += ncpus - 1; } } int machine_init(struct statics *statics, char do_unames) { int i, j, empty, pagesize; uint64_t arc_size; boolean_t carc_en; size_t size; struct passwd *pw; size = sizeof(smpmode); if ((sysctlbyname("machdep.smp_active", &smpmode, &size, NULL, 0) != 0 && sysctlbyname("kern.smp.active", &smpmode, &size, NULL, 0) != 0) || size != sizeof(smpmode)) smpmode = 0; size = sizeof(carc_en); if (sysctlbyname("vfs.zfs.compressed_arc_enabled", &carc_en, &size, NULL, 0) == 0 && carc_en == 1) carc_enabled = 1; size = sizeof(arc_size); if (sysctlbyname("kstat.zfs.misc.arcstats.size", &arc_size, &size, NULL, 0) == 0 && arc_size != 0) arc_enabled = 1; if (do_unames) { while ((pw = getpwent()) != NULL) { if (strlen(pw->pw_name) > namelength) namelength = strlen(pw->pw_name); } } if (smpmode && namelength > SMPUNAMELEN) namelength = SMPUNAMELEN; else if (namelength > UPUNAMELEN) namelength = UPUNAMELEN; kd = kvm_open(NULL, _PATH_DEVNULL, NULL, O_RDONLY, "kvm_open"); if (kd == NULL) return (-1); GETSYSCTL("kern.ccpu", ccpu); /* this is used in calculating WCPU -- calculate it ahead of time */ logcpu = log(loaddouble(ccpu)); pbase = NULL; pref = NULL; pcpu = NULL; nproc = 0; onproc = -1; /* get the page size and calculate pageshift from it */ pagesize = getpagesize(); pageshift = 0; while (pagesize > 1) { pageshift++; pagesize >>= 1; } /* we only need the amount of log(2)1024 for our conversion */ pageshift -= LOG1024; /* fill in the statics information */ statics->procstate_names = procstatenames; statics->cpustate_names = cpustatenames; statics->memory_names = memorynames; if (arc_enabled) statics->arc_names = arcnames; else statics->arc_names = NULL; if (carc_enabled) statics->carc_names = carcnames; else statics->carc_names = NULL; statics->swap_names = swapnames; #ifdef ORDER statics->order_names = ordernames; #endif /* Allocate state for per-CPU stats. */ cpumask = 0; ncpus = 0; GETSYSCTL("kern.smp.maxcpus", maxcpu); size = sizeof(long) * maxcpu * CPUSTATES; times = malloc(size); if (times == NULL) err(1, "malloc %zu bytes", size); if (sysctlbyname("kern.cp_times", times, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); pcpu_cp_time = calloc(1, size); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { cpumask |= (1ul << i); ncpus++; } } size = sizeof(long) * ncpus * CPUSTATES; pcpu_cp_old = calloc(1, size); pcpu_cp_diff = calloc(1, size); pcpu_cpu_states = calloc(1, size); statics->ncpus = ncpus; update_layout(); /* all done! */ return (0); } char * format_header(char *uname_field) { static char Header[128]; const char *prehead; if (ps.jail) jidlength = TOP_JID_LEN + 1; /* +1 for extra left space. */ else jidlength = 0; if (ps.swap) swaplength = TOP_SWAP_LEN + 1; /* +1 for extra left space */ else swaplength = 0; switch (displaymode) { case DISP_CPU: /* * The logic of picking the right header format seems reverse * here because we only want to display a THR column when * "thread mode" is off (and threads are not listed as * separate lines). */ prehead = smpmode ? (ps.thread ? smp_header : smp_header_thr) : (ps.thread ? up_header : up_header_thr); snprintf(Header, sizeof(Header), prehead, jidlength, ps.jail ? " JID" : "", namelength, namelength, uname_field, swaplength, ps.swap ? " SWAP" : "", ps.wcpu ? "WCPU" : "CPU"); break; case DISP_IO: prehead = io_header; snprintf(Header, sizeof(Header), prehead, jidlength, ps.jail ? " JID" : "", namelength, namelength, uname_field); break; } cmdlengthdelta = strlen(Header) - 7; return (Header); } static int swappgsin = -1; static int swappgsout = -1; extern struct timeval timeout; void get_system_info(struct system_info *si) { long total; struct loadavg sysload; int mib[2]; struct timeval boottime; uint64_t arc_stat, arc_stat2; int i, j; size_t size; /* get the CPU stats */ size = (maxid + 1) * CPUSTATES * sizeof(long); if (sysctlbyname("kern.cp_times", pcpu_cp_time, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); GETSYSCTL("kern.cp_time", cp_time); GETSYSCTL("vm.loadavg", sysload); GETSYSCTL("kern.lastpid", lastpid); /* convert load averages to doubles */ for (i = 0; i < 3; i++) si->load_avg[i] = (double)sysload.ldavg[i] / sysload.fscale; /* convert cp_time counts to percentages */ for (i = j = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; percentages(CPUSTATES, &pcpu_cpu_states[j * CPUSTATES], &pcpu_cp_time[j * CPUSTATES], &pcpu_cp_old[j * CPUSTATES], &pcpu_cp_diff[j * CPUSTATES]); j++; } percentages(CPUSTATES, cpu_states, cp_time, cp_old, cp_diff); /* sum memory & swap statistics */ { static unsigned int swap_delay = 0; static int swapavail = 0; static int swapfree = 0; static long bufspace = 0; - static int nspgsin, nspgsout; + static uint64_t nspgsin, nspgsout; GETSYSCTL("vfs.bufspace", bufspace); GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]); GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]); GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]); GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]); GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]); GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin); GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout); /* convert memory stats to Kbytes */ memory_stats[0] = pagetok(memory_stats[0]); memory_stats[1] = pagetok(memory_stats[1]); memory_stats[2] = pagetok(memory_stats[2]); memory_stats[3] = pagetok(memory_stats[3]); memory_stats[4] = bufspace / 1024; memory_stats[5] = pagetok(memory_stats[5]); memory_stats[6] = -1; /* first interval */ if (swappgsin < 0) { swap_stats[4] = 0; swap_stats[5] = 0; } /* compute differences between old and new swap statistic */ else { swap_stats[4] = pagetok(((nspgsin - swappgsin))); swap_stats[5] = pagetok(((nspgsout - swappgsout))); } swappgsin = nspgsin; swappgsout = nspgsout; /* call CPU heavy swapmode() only for changes */ if (swap_stats[4] > 0 || swap_stats[5] > 0 || swap_delay == 0) { swap_stats[3] = swapmode(&swapavail, &swapfree); swap_stats[0] = swapavail; swap_stats[1] = swapavail - swapfree; swap_stats[2] = swapfree; } swap_delay = 1; swap_stats[6] = -1; } if (arc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.size", arc_stat); arc_stats[0] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mfu_size", arc_stat); arc_stats[1] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mru_size", arc_stat); arc_stats[2] = arc_stat >> 10; GETSYSCTL("vfs.zfs.anon_size", arc_stat); arc_stats[3] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.hdr_size", arc_stat); GETSYSCTL("kstat.zfs.misc.arcstats.l2_hdr_size", arc_stat2); arc_stats[4] = arc_stat + arc_stat2 >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.other_size", arc_stat); arc_stats[5] = arc_stat >> 10; si->arc = arc_stats; } if (carc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.compressed_size", arc_stat); carc_stats[0] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.uncompressed_size", arc_stat); carc_stats[1] = arc_stat >> 10; carc_stats[2] = arc_stats[0]; /* ARC Total */ GETSYSCTL("kstat.zfs.misc.arcstats.overhead_size", arc_stat); carc_stats[3] = arc_stat >> 10; si->carc = carc_stats; } /* set arrays and strings */ if (pcpu_stats) { si->cpustates = pcpu_cpu_states; si->ncpus = ncpus; } else { si->cpustates = cpu_states; si->ncpus = 1; } si->memory = memory_stats; si->swap = swap_stats; if (lastpid > 0) { si->last_pid = lastpid; } else { si->last_pid = -1; } /* * Print how long system has been up. * (Found by looking getting "boottime" from the kernel) */ mib[0] = CTL_KERN; mib[1] = KERN_BOOTTIME; size = sizeof(boottime); if (sysctl(mib, nitems(mib), &boottime, &size, NULL, 0) != -1 && boottime.tv_sec != 0) { si->boottime = boottime; } else { si->boottime.tv_sec = -1; } } #define NOPROC ((void *)-1) /* * We need to compare data from the old process entry with the new * process entry. * To facilitate doing this quickly we stash a pointer in the kinfo_proc * structure to cache the mapping. We also use a negative cache pointer * of NOPROC to avoid duplicate lookups. * XXX: this could be done when the actual processes are fetched, we do * it here out of laziness. */ const struct kinfo_proc * get_old_proc(struct kinfo_proc *pp) { struct kinfo_proc **oldpp, *oldp; /* * If this is the first fetch of the kinfo_procs then we don't have * any previous entries. */ if (previous_proc_count == 0) return (NULL); /* negative cache? */ if (pp->ki_udata == NOPROC) return (NULL); /* cached? */ if (pp->ki_udata != NULL) return (pp->ki_udata); /* * Not cached, * 1) look up based on pid. * 2) compare process start. * If we fail here, then setup a negative cache entry, otherwise * cache it. */ oldpp = bsearch(&pp, previous_pref, previous_proc_count, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); if (oldpp == NULL) { pp->ki_udata = NOPROC; return (NULL); } oldp = *oldpp; if (bcmp(&oldp->ki_start, &pp->ki_start, sizeof(pp->ki_start)) != 0) { pp->ki_udata = NOPROC; return (NULL); } pp->ki_udata = oldp; return (oldp); } /* * Return the total amount of IO done in blocks in/out and faults. * store the values individually in the pointers passed in. */ long get_io_stats(struct kinfo_proc *pp, long *inp, long *oup, long *flp, long *vcsw, long *ivcsw) { const struct kinfo_proc *oldp; static struct kinfo_proc dummy; long ret; oldp = get_old_proc(pp); if (oldp == NULL) { bzero(&dummy, sizeof(dummy)); oldp = &dummy; } *inp = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; *oup = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; *flp = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; *vcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; *ivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; ret = (RU(pp)->ru_inblock - RU(oldp)->ru_inblock) + (RU(pp)->ru_oublock - RU(oldp)->ru_oublock) + (RU(pp)->ru_majflt - RU(oldp)->ru_majflt); return (ret); } /* * If there was a previous update, use the delta in ki_runtime over * the previous interval to calculate pctcpu. Otherwise, fall back * to using the kernel's ki_pctcpu. */ static double proc_calc_pctcpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; if (previous_interval != 0) { oldp = get_old_proc(pp); if (oldp != NULL) return ((double)(pp->ki_runtime - oldp->ki_runtime) / previous_interval); /* * If this process/thread was created during the previous * interval, charge it's total runtime to the previous * interval. */ else if (pp->ki_start.tv_sec > previous_wall_time.tv_sec || (pp->ki_start.tv_sec == previous_wall_time.tv_sec && pp->ki_start.tv_usec >= previous_wall_time.tv_usec)) return ((double)pp->ki_runtime / previous_interval); } return (pctdouble(pp->ki_pctcpu)); } /* * Return true if this process has used any CPU time since the * previous update. */ static int proc_used_cpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; oldp = get_old_proc(pp); if (oldp == NULL) return (PCTCPU(pp) != 0); return (pp->ki_runtime != oldp->ki_runtime || RU(pp)->ru_nvcsw != RU(oldp)->ru_nvcsw || RU(pp)->ru_nivcsw != RU(oldp)->ru_nivcsw); } /* * Return the total number of block in/out and faults by a process. */ long get_io_total(struct kinfo_proc *pp) { long dummy; return (get_io_stats(pp, &dummy, &dummy, &dummy, &dummy, &dummy)); } static struct handle handle; caddr_t get_process_info(struct system_info *si, struct process_select *sel, int (*compare)(const void *, const void *)) { int i; int total_procs; long p_io; long p_inblock, p_oublock, p_majflt, p_vcsw, p_ivcsw; long nsec; int active_procs; struct kinfo_proc **prefp; struct kinfo_proc *pp; struct timespec previous_proc_uptime; /* these are copied out of sel for speed */ int show_idle; int show_jid; int show_self; int show_system; int show_uid; int show_command; int show_kidle; /* * If thread state was toggled, don't cache the previous processes. */ if (previous_thread != sel->thread) nproc = 0; previous_thread = sel->thread; /* * Save the previous process info. */ if (previous_proc_count_max < nproc) { free(previous_procs); previous_procs = malloc(nproc * sizeof(*previous_procs)); free(previous_pref); previous_pref = malloc(nproc * sizeof(*previous_pref)); if (previous_procs == NULL || previous_pref == NULL) { (void) fprintf(stderr, "top: Out of memory.\n"); quit(23); } previous_proc_count_max = nproc; } if (nproc) { for (i = 0; i < nproc; i++) previous_pref[i] = &previous_procs[i]; bcopy(pbase, previous_procs, nproc * sizeof(*previous_procs)); qsort(previous_pref, nproc, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); } previous_proc_count = nproc; previous_proc_uptime = proc_uptime; previous_wall_time = proc_wall_time; previous_interval = 0; pbase = kvm_getprocs(kd, sel->thread ? KERN_PROC_ALL : KERN_PROC_PROC, 0, &nproc); (void)gettimeofday(&proc_wall_time, NULL); if (clock_gettime(CLOCK_UPTIME, &proc_uptime) != 0) memset(&proc_uptime, 0, sizeof(proc_uptime)); else if (previous_proc_uptime.tv_sec != 0 && previous_proc_uptime.tv_nsec != 0) { previous_interval = (proc_uptime.tv_sec - previous_proc_uptime.tv_sec) * 1000000; nsec = proc_uptime.tv_nsec - previous_proc_uptime.tv_nsec; if (nsec < 0) { previous_interval -= 1000000; nsec += 1000000000; } previous_interval += nsec / 1000; } if (nproc > onproc) { pref = realloc(pref, sizeof(*pref) * nproc); pcpu = realloc(pcpu, sizeof(*pcpu) * nproc); onproc = nproc; } if (pref == NULL || pbase == NULL || pcpu == NULL) { (void) fprintf(stderr, "top: Out of memory.\n"); quit(23); } /* get a pointer to the states summary array */ si->procstates = process_states; /* set up flags which define what we are going to select */ show_idle = sel->idle; show_jid = sel->jid != -1; show_self = sel->self == -1; show_system = sel->system; show_uid = sel->uid != -1; show_command = sel->command != NULL; show_kidle = sel->kidle; /* count up process states and get pointers to interesting procs */ total_procs = 0; active_procs = 0; total_inblock = 0; total_oublock = 0; total_majflt = 0; memset((char *)process_states, 0, sizeof(process_states)); prefp = pref; for (pp = pbase, i = 0; i < nproc; pp++, i++) { if (pp->ki_stat == 0) /* not in use */ continue; if (!show_self && pp->ki_pid == sel->self) /* skip self */ continue; if (!show_system && (pp->ki_flag & P_SYSTEM)) /* skip system process */ continue; p_io = get_io_stats(pp, &p_inblock, &p_oublock, &p_majflt, &p_vcsw, &p_ivcsw); total_inblock += p_inblock; total_oublock += p_oublock; total_majflt += p_majflt; total_procs++; process_states[pp->ki_stat]++; if (pp->ki_stat == SZOMB) /* skip zombies */ continue; if (!show_kidle && pp->ki_tdflags & TDF_IDLETD) /* skip kernel idle process */ continue; PCTCPU(pp) = proc_calc_pctcpu(pp); if (sel->thread && PCTCPU(pp) > 1.0) PCTCPU(pp) = 1.0; if (displaymode == DISP_CPU && !show_idle && (!proc_used_cpu(pp) || pp->ki_stat == SSTOP || pp->ki_stat == SIDL)) /* skip idle or non-running processes */ continue; if (displaymode == DISP_IO && !show_idle && p_io == 0) /* skip processes that aren't doing I/O */ continue; if (show_jid && pp->ki_jid != sel->jid) /* skip proc. that don't belong to the selected JID */ continue; if (show_uid && pp->ki_ruid != (uid_t)sel->uid) /* skip proc. that don't belong to the selected UID */ continue; *prefp++ = pp; active_procs++; } /* if requested, sort the "interesting" processes */ if (compare != NULL) qsort(pref, active_procs, sizeof(*pref), compare); /* remember active and total counts */ si->p_total = total_procs; si->p_active = pref_len = active_procs; /* pass back a handle */ handle.next_proc = pref; handle.remaining = active_procs; return ((caddr_t)&handle); } static char fmt[512]; /* static area where result is built */ char * format_next_process(caddr_t handle, char *(*get_userid)(int), int flags) { struct kinfo_proc *pp; const struct kinfo_proc *oldp; long cputime; double pct; struct handle *hp; char status[16]; int cpu, state; struct rusage ru, *rup; long p_tot, s_tot; char *proc_fmt, thr_buf[6]; char jid_buf[TOP_JID_LEN + 1], swap_buf[TOP_SWAP_LEN + 1]; char *cmdbuf = NULL; char **args; const int cmdlen = 128; /* find and remember the next proc structure */ hp = (struct handle *)handle; pp = *(hp->next_proc++); hp->remaining--; /* get the process's command name */ if ((pp->ki_flag & P_INMEM) == 0) { /* * Print swapped processes as */ size_t len; len = strlen(pp->ki_comm); if (len > sizeof(pp->ki_comm) - 3) len = sizeof(pp->ki_comm) - 3; memmove(pp->ki_comm + 1, pp->ki_comm, len); pp->ki_comm[0] = '<'; pp->ki_comm[len + 1] = '>'; pp->ki_comm[len + 2] = '\0'; } /* * Convert the process's runtime from microseconds to seconds. This * time includes the interrupt time although that is not wanted here. * ps(1) is similarly sloppy. */ cputime = (pp->ki_runtime + 500000) / 1000000; /* calculate the base for cpu percentages */ pct = PCTCPU(pp); /* generate "STATE" field */ switch (state = pp->ki_stat) { case SRUN: if (smpmode && pp->ki_oncpu != NOCPU) sprintf(status, "CPU%d", pp->ki_oncpu); else strcpy(status, "RUN"); break; case SLOCK: if (pp->ki_kiflag & KI_LOCKBLOCK) { sprintf(status, "*%.6s", pp->ki_lockname); break; } /* fall through */ case SSLEEP: if (pp->ki_wmesg != NULL) { sprintf(status, "%.6s", pp->ki_wmesg); break; } /* FALLTHROUGH */ default: if (state >= 0 && state < sizeof(state_abbrev) / sizeof(*state_abbrev)) sprintf(status, "%.6s", state_abbrev[state]); else sprintf(status, "?%5d", state); break; } cmdbuf = (char *)malloc(cmdlen + 1); if (cmdbuf == NULL) { warn("malloc(%d)", cmdlen + 1); return NULL; } if (!(flags & FMT_SHOWARGS)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, cmdlen, "%s{%s%s}", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, cmdlen, "%s", pp->ki_comm); } } else { if (pp->ki_flag & P_SYSTEM || pp->ki_args == NULL || (args = kvm_getargv(kd, pp, cmdlen)) == NULL || !(*args)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, cmdlen, "[%s{%s%s}]", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, cmdlen, "[%s]", pp->ki_comm); } } else { char *src, *dst, *argbuf; char *cmd; size_t argbuflen; size_t len; argbuflen = cmdlen * 4; argbuf = (char *)malloc(argbuflen + 1); if (argbuf == NULL) { warn("malloc(%zu)", argbuflen + 1); free(cmdbuf); return NULL; } dst = argbuf; /* Extract cmd name from argv */ cmd = strrchr(*args, '/'); if (cmd == NULL) cmd = *args; else cmd++; for (; (src = *args++) != NULL; ) { if (*src == '\0') continue; len = (argbuflen - (dst - argbuf) - 1) / 4; strvisx(dst, src, MIN(strlen(src), len), VIS_NL | VIS_CSTYLE); while (*dst != '\0') dst++; if ((argbuflen - (dst - argbuf) - 1) / 4 > 0) *dst++ = ' '; /* add delimiting space */ } if (dst != argbuf && dst[-1] == ' ') dst--; *dst = '\0'; if (strcmp(cmd, pp->ki_comm) != 0) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, cmdlen, "%s (%s){%s%s}", argbuf, pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); else snprintf(cmdbuf, cmdlen, "%s (%s)", argbuf, pp->ki_comm); } else { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, cmdlen, "%s{%s%s}", argbuf, pp->ki_tdname, pp->ki_moretdname); else strlcpy(cmdbuf, argbuf, cmdlen); } free(argbuf); } } if (ps.jail == 0) jid_buf[0] = '\0'; else snprintf(jid_buf, sizeof(jid_buf), "%*d", jidlength - 1, pp->ki_jid); if (ps.swap == 0) swap_buf[0] = '\0'; else snprintf(swap_buf, sizeof(swap_buf), "%*s", swaplength - 1, format_k2(pagetok(ki_swap(pp)))); /* XXX */ if (displaymode == DISP_IO) { oldp = get_old_proc(pp); if (oldp != NULL) { ru.ru_inblock = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; ru.ru_oublock = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; ru.ru_majflt = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; ru.ru_nvcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; ru.ru_nivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; rup = &ru; } else { rup = RU(pp); } p_tot = rup->ru_inblock + rup->ru_oublock + rup->ru_majflt; s_tot = total_inblock + total_oublock + total_majflt; snprintf(fmt, sizeof(fmt), io_Proc_format, pp->ki_pid, jidlength, jid_buf, namelength, namelength, (*get_userid)(pp->ki_ruid), rup->ru_nvcsw, rup->ru_nivcsw, rup->ru_inblock, rup->ru_oublock, rup->ru_majflt, p_tot, s_tot == 0 ? 0.0 : (p_tot * 100.0 / s_tot), screen_width > cmdlengthdelta ? screen_width - cmdlengthdelta : 0, printable(cmdbuf)); free(cmdbuf); return (fmt); } /* format this entry */ if (smpmode) { if (state == SRUN && pp->ki_oncpu != NOCPU) cpu = pp->ki_oncpu; else cpu = pp->ki_lastcpu; } else cpu = 0; proc_fmt = smpmode ? smp_Proc_format : up_Proc_format; if (ps.thread != 0) thr_buf[0] = '\0'; else snprintf(thr_buf, sizeof(thr_buf), "%*d ", (int)(sizeof(thr_buf) - 2), pp->ki_numthreads); snprintf(fmt, sizeof(fmt), proc_fmt, pp->ki_pid, jidlength, jid_buf, namelength, namelength, (*get_userid)(pp->ki_ruid), thr_buf, pp->ki_pri.pri_level - PZERO, format_nice(pp), format_k2(PROCSIZE(pp)), format_k2(pagetok(pp->ki_rssize)), swaplength, swaplength, swap_buf, status, cpu, format_time(cputime), ps.wcpu ? 100.0 * weighted_cpu(pct, pp) : 100.0 * pct, screen_width > cmdlengthdelta ? screen_width - cmdlengthdelta : 0, printable(cmdbuf)); free(cmdbuf); /* return the result */ return (fmt); } static void getsysctl(const char *name, void *ptr, size_t len) { size_t nlen = len; if (sysctlbyname(name, ptr, &nlen, NULL, 0) == -1) { fprintf(stderr, "top: sysctl(%s...) failed: %s\n", name, strerror(errno)); quit(23); } if (nlen != len) { fprintf(stderr, "top: sysctl(%s...) expected %lu, got %lu\n", name, (unsigned long)len, (unsigned long)nlen); quit(23); } } static const char * format_nice(const struct kinfo_proc *pp) { const char *fifo, *kproc; int rtpri; static char nicebuf[4 + 1]; fifo = PRI_NEED_RR(pp->ki_pri.pri_class) ? "" : "F"; kproc = (pp->ki_flag & P_KPROC) ? "k" : ""; switch (PRI_BASE(pp->ki_pri.pri_class)) { case PRI_ITHD: return ("-"); case PRI_REALTIME: /* * XXX: the kernel doesn't tell us the original rtprio and * doesn't really know what it was, so to recover it we * must be more chummy with the implementation than the * implementation is with itself. pri_user gives a * constant "base" priority, but is only initialized * properly for user threads. pri_native gives what the * kernel calls the "base" priority, but it isn't constant * since it is changed by priority propagation. pri_native * also isn't properly initialized for all threads, but it * is properly initialized for kernel realtime and idletime * threads. Thus we use pri_user for the base priority of * user threads (it is always correct) and pri_native for * the base priority of kernel realtime and idletime threads * (there is nothing better, and it is usually correct). * * The field width and thus the buffer are too small for * values like "kr31F", but such values shouldn't occur, * and if they do then the tailing "F" is not displayed. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_REALTIME; snprintf(nicebuf, sizeof(nicebuf), "%sr%d%s", kproc, rtpri, fifo); break; case PRI_TIMESHARE: if (pp->ki_flag & P_KPROC) return ("-"); snprintf(nicebuf, sizeof(nicebuf), "%d", pp->ki_nice - NZERO); break; case PRI_IDLE: /* XXX: as above. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_IDLE; snprintf(nicebuf, sizeof(nicebuf), "%si%d%s", kproc, rtpri, fifo); break; default: return ("?"); } return (nicebuf); } /* comparison routines for qsort */ static int compare_pid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; if ((*pp2)->ki_pid < 0 || (*pp1)->ki_pid < 0) abort(); return ((*pp1)->ki_pid - (*pp2)->ki_pid); } static int compare_tid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; if ((*pp2)->ki_tid < 0 || (*pp1)->ki_tid < 0) abort(); return ((*pp1)->ki_tid - (*pp2)->ki_tid); } /* * proc_compare - comparison function for "qsort" * Compares the resource consumption of two processes using five * distinct keys. The keys (in descending order of importance) are: * percent cpu, cpu ticks, state, resident set size, total virtual * memory usage. The process states are ordered as follows (from least * to most important): WAIT, zombie, sleep, stop, start, run. The * array declaration below maps a process state index into a number * that reflects this ordering. */ static int sorted_state[] = { 0, /* not used */ 3, /* sleep */ 1, /* ABANDONED (WAIT) */ 6, /* run */ 5, /* start */ 2, /* zombie */ 4 /* stop */ }; #define ORDERKEY_PCTCPU(a, b) do { \ double diff; \ if (ps.wcpu) \ diff = weighted_cpu(PCTCPU((b)), (b)) - \ weighted_cpu(PCTCPU((a)), (a)); \ else \ diff = PCTCPU((b)) - PCTCPU((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_CPTICKS(a, b) do { \ int64_t diff = (int64_t)(b)->ki_runtime - (int64_t)(a)->ki_runtime; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_STATE(a, b) do { \ int diff = sorted_state[(b)->ki_stat] - sorted_state[(a)->ki_stat]; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_PRIO(a, b) do { \ int diff = (int)(b)->ki_pri.pri_level - (int)(a)->ki_pri.pri_level; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_THREADS(a, b) do { \ int diff = (int)(b)->ki_numthreads - (int)(a)->ki_numthreads; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_RSSIZE(a, b) do { \ long diff = (long)(b)->ki_rssize - (long)(a)->ki_rssize; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_MEM(a, b) do { \ long diff = (long)PROCSIZE((b)) - (long)PROCSIZE((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_JID(a, b) do { \ int diff = (int)(b)->ki_jid - (int)(a)->ki_jid; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_SWAP(a, b) do { \ int diff = (int)ki_swap(b) - (int)ki_swap(a); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) /* compare_cpu - the comparison function for sorting by cpu percentage */ int #ifdef ORDER compare_cpu(void *arg1, void *arg2) #else proc_compare(void *arg1, void *arg2) #endif { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } #ifdef ORDER /* "cpu" compare routines */ int compare_size(), compare_res(), compare_time(), compare_prio(), compare_threads(); /* * "io" compare routines. Context switches aren't i/o, but are displayed * on the "io" display. */ int compare_iototal(), compare_ioread(), compare_iowrite(), compare_iofault(), compare_vcsw(), compare_ivcsw(); int (*compares[])() = { compare_cpu, compare_size, compare_res, compare_time, compare_prio, compare_threads, compare_iototal, compare_ioread, compare_iowrite, compare_iofault, compare_vcsw, compare_ivcsw, compare_jid, compare_swap, NULL }; /* compare_size - the comparison function for sorting by total memory usage */ int compare_size(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_MEM(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_res - the comparison function for sorting by resident set size */ int compare_res(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_time - the comparison function for sorting by total cpu time */ int compare_time(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_prio - the comparison function for sorting by priority */ int compare_prio(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_PRIO(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_threads - the comparison function for sorting by threads */ int compare_threads(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_THREADS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_jid - the comparison function for sorting by jid */ static int compare_jid(const void *arg1, const void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_JID(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_swap - the comparison function for sorting by swap */ static int compare_swap(const void *arg1, const void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_SWAP(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } #endif /* ORDER */ /* assorted comparison functions for sorting by i/o */ int #ifdef ORDER compare_iototal(void *arg1, void *arg2) #else io_compare(void *arg1, void *arg2) #endif { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; return (get_io_total(p2) - get_io_total(p1)); } #ifdef ORDER int compare_ioread(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, inp1, inp2; (void) get_io_stats(p1, &inp1, &dummy, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &inp2, &dummy, &dummy, &dummy, &dummy); return (inp2 - inp1); } int compare_iowrite(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, oup1, oup2; (void) get_io_stats(p1, &dummy, &oup1, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &oup2, &dummy, &dummy, &dummy); return (oup2 - oup1); } int compare_iofault(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &flp1, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &flp2, &dummy, &dummy); return (flp2 - flp1); } int compare_vcsw(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &flp1, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &flp2, &dummy); return (flp2 - flp1); } int compare_ivcsw(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &dummy, &flp1); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &dummy, &flp2); return (flp2 - flp1); } #endif /* ORDER */ /* * proc_owner(pid) - returns the uid that owns process "pid", or -1 if * the process does not exist. * It is EXTREMELY IMPORTANT that this function work correctly. * If top runs setuid root (as in SVR4), then this function * is the only thing that stands in the way of a serious * security problem. It validates requests for the "kill" * and "renice" commands. */ int proc_owner(int pid) { int cnt; struct kinfo_proc **prefp; struct kinfo_proc *pp; prefp = pref; cnt = pref_len; while (--cnt >= 0) { pp = *prefp++; if (pp->ki_pid == (pid_t)pid) return ((int)pp->ki_ruid); } return (-1); } static int swapmode(int *retavail, int *retfree) { int n; int pagesize = getpagesize(); struct kvm_swap swapary[1]; *retavail = 0; *retfree = 0; #define CONVERT(v) ((quad_t)(v) * pagesize / 1024) n = kvm_getswapinfo(kd, swapary, 1, 0); if (n < 0 || swapary[0].ksw_total == 0) return (0); *retavail = CONVERT(swapary[0].ksw_total); *retfree = CONVERT(swapary[0].ksw_total - swapary[0].ksw_used); n = (int)(swapary[0].ksw_used * 100.0 / swapary[0].ksw_total); return (n); } Index: head/usr.bin/vmstat/vmstat.c =================================================================== --- head/usr.bin/vmstat/vmstat.c (revision 317060) +++ head/usr.bin/vmstat/vmstat.c (revision 317061) @@ -1,1725 +1,1733 @@ /* * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1986, 1991, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)vmstat.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#define _WANT_VMMETER #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VMSTAT_XO_VERSION "1" static char da[] = "da"; static struct nlist namelist[] = { #define X_SUM 0 { "_vm_cnt" }, #define X_HZ 1 { "_hz" }, #define X_STATHZ 2 { "_stathz" }, #define X_NCHSTATS 3 { "_nchstats" }, #define X_INTRNAMES 4 { "_intrnames" }, #define X_SINTRNAMES 5 { "_sintrnames" }, #define X_INTRCNT 6 { "_intrcnt" }, #define X_SINTRCNT 7 { "_sintrcnt" }, #ifdef notyet #define X_DEFICIT XXX { "_deficit" }, #define X_REC XXX { "_rectime" }, #define X_PGIN XXX { "_pgintime" }, #define X_XSTATS XXX { "_xstats" }, #define X_END XXX #else #define X_END 8 #endif { "" }, }; static struct statinfo cur, last; static int num_devices, maxshowdevs; static long generation; static struct device_selection *dev_select; static int num_selected; static struct devstat_match *matches; static int num_matches = 0; static int num_devices_specified, num_selections; static long select_generation; static char **specified_devices; static devstat_select_mode select_mode; -static struct vmmeter sum, osum; +static struct __vmmeter { + uint64_t v_swtch; + uint64_t v_trap; + uint64_t v_syscall; + uint64_t v_intr; + uint64_t v_soft; + uint64_t v_vm_faults; + uint64_t v_io_faults; + uint64_t v_cow_faults; + uint64_t v_cow_optim; + uint64_t v_zfod; + uint64_t v_ozfod; + uint64_t v_swapin; + uint64_t v_swapout; + uint64_t v_swappgsin; + uint64_t v_swappgsout; + uint64_t v_vnodein; + uint64_t v_vnodeout; + uint64_t v_vnodepgsin; + uint64_t v_vnodepgsout; + uint64_t v_intrans; + uint64_t v_reactivated; + uint64_t v_pdwakeups; + uint64_t v_pdpages; + uint64_t v_pdshortfalls; + uint64_t v_dfree; + uint64_t v_pfree; + uint64_t v_tfree; + uint64_t v_forks; + uint64_t v_vforks; + uint64_t v_rforks; + uint64_t v_kthreads; + uint64_t v_forkpages; + uint64_t v_vforkpages; + uint64_t v_rforkpages; + uint64_t v_kthreadpages; + u_int v_page_size; + u_int v_page_count; + u_int v_free_reserved; + u_int v_free_target; + u_int v_free_min; + u_int v_free_count; + u_int v_wire_count; + u_int v_active_count; + u_int v_inactive_target; + u_int v_inactive_count; + u_int v_laundry_count; + u_int v_pageout_free_min; + u_int v_interrupt_free_min; + u_int v_free_severe; +} sum, osum; #define VMSTAT_DEFAULT_LINES 20 /* Default number of `winlines'. */ volatile sig_atomic_t wresized; /* Tty resized, when non-zero. */ static int winlines = VMSTAT_DEFAULT_LINES; /* Current number of tty rows. */ static int aflag; static int nflag; static int Pflag; static int hflag; static kvm_t *kd; #define FORKSTAT 0x01 #define INTRSTAT 0x02 #define MEMSTAT 0x04 #define SUMSTAT 0x08 #define TIMESTAT 0x10 #define VMSTAT 0x20 #define ZMEMSTAT 0x40 #define OBJSTAT 0x80 static void cpustats(void); static void pcpustats(int, u_long, int); static void devstats(void); static void doforkst(void); static void dointr(unsigned int, int); static void doobjstat(void); static void dosum(void); static void dovmstat(unsigned int, int); static void domemstat_malloc(void); static void domemstat_zone(void); static void kread(int, void *, size_t); static void kreado(int, void *, size_t, size_t); static char *kgetstr(const char *); static void needhdr(int); static void needresize(int); static void doresize(void); static void printhdr(int, u_long); static void usage(void); static long pct(long, long); static long long getuptime(void); static char **getdrivedata(char **); int main(int argc, char *argv[]) { int c, todo; unsigned int interval; float f; int reps; char *memf, *nlistf; char errbuf[_POSIX2_LINE_MAX]; memf = nlistf = NULL; interval = reps = todo = 0; maxshowdevs = 2; hflag = isatty(1); argc = xo_parse_args(argc, argv); if (argc < 0) return argc; while ((c = getopt(argc, argv, "ac:fhHiM:mN:n:oPp:stw:z")) != -1) { switch (c) { case 'a': aflag++; break; case 'c': reps = atoi(optarg); break; case 'P': Pflag++; break; case 'f': todo |= FORKSTAT; break; case 'h': hflag = 1; break; case 'H': hflag = 0; break; case 'i': todo |= INTRSTAT; break; case 'M': memf = optarg; break; case 'm': todo |= MEMSTAT; break; case 'N': nlistf = optarg; break; case 'n': nflag = 1; maxshowdevs = atoi(optarg); if (maxshowdevs < 0) xo_errx(1, "number of devices %d is < 0", maxshowdevs); break; case 'o': todo |= OBJSTAT; break; case 'p': if (devstat_buildmatch(optarg, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); break; case 's': todo |= SUMSTAT; break; case 't': #ifdef notyet todo |= TIMESTAT; #else xo_errx(EX_USAGE, "sorry, -t is not (re)implemented yet"); #endif break; case 'w': /* Convert to milliseconds. */ f = atof(optarg); interval = f * 1000; break; case 'z': todo |= ZMEMSTAT; break; case '?': default: usage(); } } argc -= optind; argv += optind; xo_set_version(VMSTAT_XO_VERSION); if (todo == 0) todo = VMSTAT; if (memf != NULL) { kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf); if (kd == NULL) xo_errx(1, "kvm_openfiles: %s", errbuf); } retry_nlist: if (kd != NULL && (c = kvm_nlist(kd, namelist)) != 0) { if (c > 0) { int bufsize = 0, len = 0; char *buf, *bp; /* * 'cnt' was renamed to 'vm_cnt'. If 'vm_cnt' is not * found try looking up older 'cnt' symbol. * */ if (namelist[X_SUM].n_type == 0 && strcmp(namelist[X_SUM].n_name, "_vm_cnt") == 0) { namelist[X_SUM].n_name = "_cnt"; goto retry_nlist; } for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) bufsize += strlen(namelist[c].n_name) + 1; bufsize += len + 1; buf = bp = alloca(bufsize); for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) { xo_error(" %s", namelist[c].n_name); len = strlen(namelist[c].n_name); *bp++ = ' '; memcpy(bp, namelist[c].n_name, len); bp += len; } *bp = '\0'; xo_error("undefined symbols:\n", buf); } else xo_warnx("kvm_nlist: %s", kvm_geterr(kd)); xo_finish(); exit(1); } if (kd && Pflag) xo_errx(1, "Cannot use -P with crash dumps"); if (todo & VMSTAT) { /* * Make sure that the userland devstat version matches the * kernel devstat version. If not, exit and print a * message informing the user of his mistake. */ if (devstat_checkversion(NULL) < 0) xo_errx(1, "%s", devstat_errbuf); argv = getdrivedata(argv); } if (*argv) { f = atof(*argv); interval = f * 1000; if (*++argv) reps = atoi(*argv); } if (interval) { if (!reps) reps = -1; } else if (reps) interval = 1 * 1000; if (todo & FORKSTAT) doforkst(); if (todo & MEMSTAT) domemstat_malloc(); if (todo & ZMEMSTAT) domemstat_zone(); if (todo & SUMSTAT) dosum(); if (todo & OBJSTAT) doobjstat(); #ifdef notyet if (todo & TIMESTAT) dotimes(); #endif if (todo & INTRSTAT) dointr(interval, reps); if (todo & VMSTAT) dovmstat(interval, reps); xo_finish(); exit(0); } static int mysysctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; error = sysctlbyname(name, oldp, oldlenp, newp, newlen); if (error != 0 && errno != ENOMEM) xo_err(1, "sysctl(%s)", name); return (error); } static char ** getdrivedata(char **argv) { if ((num_devices = devstat_getnumdevs(NULL)) < 0) xo_errx(1, "%s", devstat_errbuf); cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); if (devstat_getdevs(NULL, &cur) == -1) xo_errx(1, "%s", devstat_errbuf); num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; specified_devices = (char **)malloc(sizeof(char *)); for (num_devices_specified = 0; *argv; ++argv) { if (isdigit(**argv)) break; num_devices_specified++; specified_devices = (char **)realloc(specified_devices, sizeof(char *) * num_devices_specified); specified_devices[num_devices_specified - 1] = *argv; } dev_select = NULL; if (nflag == 0 && maxshowdevs < num_devices_specified) maxshowdevs = num_devices_specified; /* * People are generally only interested in disk statistics when * they're running vmstat. So, that's what we're going to give * them if they don't specify anything by default. We'll also give * them any other random devices in the system so that we get to * maxshowdevs devices, if that many devices exist. If the user * specifies devices on the command line, either through a pattern * match or by naming them explicitly, we will give the user only * those devices. */ if ((num_devices_specified == 0) && (num_matches == 0)) { if (devstat_buildmatch(da, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); select_mode = DS_SELECT_ADD; } else select_mode = DS_SELECT_ONLY; /* * At this point, selectdevs will almost surely indicate that the * device list has changed, so we don't look for return values of 0 * or 1. If we get back -1, though, there is an error. */ if (devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0) == -1) xo_errx(1, "%s", devstat_errbuf); return(argv); } /* Return system uptime in nanoseconds */ static long long getuptime(void) { struct timespec sp; (void)clock_gettime(CLOCK_UPTIME, &sp); return((long long)sp.tv_sec * 1000000000LL + sp.tv_nsec); } static void -fill_pcpu(struct pcpu ***pcpup, int* maxcpup) +fill_vmmeter(struct __vmmeter *vmmp) { struct pcpu **pcpu; - int maxcpu, i; - *pcpup = NULL; - - if (kd == NULL) - return; - - maxcpu = kvm_getmaxcpu(kd); - if (maxcpu < 0) - xo_errx(1, "kvm_getmaxcpu: %s", kvm_geterr(kd)); - - pcpu = calloc(maxcpu, sizeof(struct pcpu *)); - if (pcpu == NULL) - xo_err(1, "calloc"); - - for (i = 0; i < maxcpu; i++) { - pcpu[i] = kvm_getpcpu(kd, i); - if (pcpu[i] == (struct pcpu *)-1) - xo_errx(1, "kvm_getpcpu: %s", kvm_geterr(kd)); - } - - *maxcpup = maxcpu; - *pcpup = pcpu; -} - -static void -free_pcpu(struct pcpu **pcpu, int maxcpu) -{ - int i; - - for (i = 0; i < maxcpu; i++) - free(pcpu[i]); - free(pcpu); -} - -static void -fill_vmmeter(struct vmmeter *vmmp) -{ - struct pcpu **pcpu; - int maxcpu, i; - if (kd != NULL) { - kread(X_SUM, vmmp, sizeof(*vmmp)); - fill_pcpu(&pcpu, &maxcpu); - for (i = 0; i < maxcpu; i++) { - if (pcpu[i] == NULL) - continue; -#define ADD_FROM_PCPU(i, name) \ - vmmp->name += pcpu[i]->pc_cnt.name - ADD_FROM_PCPU(i, v_swtch); - ADD_FROM_PCPU(i, v_trap); - ADD_FROM_PCPU(i, v_syscall); - ADD_FROM_PCPU(i, v_intr); - ADD_FROM_PCPU(i, v_soft); - ADD_FROM_PCPU(i, v_vm_faults); - ADD_FROM_PCPU(i, v_io_faults); - ADD_FROM_PCPU(i, v_cow_faults); - ADD_FROM_PCPU(i, v_cow_optim); - ADD_FROM_PCPU(i, v_zfod); - ADD_FROM_PCPU(i, v_ozfod); - ADD_FROM_PCPU(i, v_swapin); - ADD_FROM_PCPU(i, v_swapout); - ADD_FROM_PCPU(i, v_swappgsin); - ADD_FROM_PCPU(i, v_swappgsout); - ADD_FROM_PCPU(i, v_vnodein); - ADD_FROM_PCPU(i, v_vnodeout); - ADD_FROM_PCPU(i, v_vnodepgsin); - ADD_FROM_PCPU(i, v_vnodepgsout); - ADD_FROM_PCPU(i, v_intrans); - ADD_FROM_PCPU(i, v_tfree); - ADD_FROM_PCPU(i, v_forks); - ADD_FROM_PCPU(i, v_vforks); - ADD_FROM_PCPU(i, v_rforks); - ADD_FROM_PCPU(i, v_kthreads); - ADD_FROM_PCPU(i, v_forkpages); - ADD_FROM_PCPU(i, v_vforkpages); - ADD_FROM_PCPU(i, v_rforkpages); - ADD_FROM_PCPU(i, v_kthreadpages); -#undef ADD_FROM_PCPU - } - free_pcpu(pcpu, maxcpu); + struct vmmeter vm_cnt; + + kread(X_SUM, &vm_cnt, sizeof(vm_cnt)); +#define GET_COUNTER(name) \ + vmmp->name = kvm_counter_u64_fetch(kd, (u_long)vm_cnt.name) + GET_COUNTER(v_swtch); + GET_COUNTER(v_trap); + GET_COUNTER(v_syscall); + GET_COUNTER(v_intr); + GET_COUNTER(v_soft); + GET_COUNTER(v_vm_faults); + GET_COUNTER(v_io_faults); + GET_COUNTER(v_cow_faults); + GET_COUNTER(v_cow_optim); + GET_COUNTER(v_zfod); + GET_COUNTER(v_ozfod); + GET_COUNTER(v_swapin); + GET_COUNTER(v_swapout); + GET_COUNTER(v_swappgsin); + GET_COUNTER(v_swappgsout); + GET_COUNTER(v_vnodein); + GET_COUNTER(v_vnodeout); + GET_COUNTER(v_vnodepgsin); + GET_COUNTER(v_vnodepgsout); + GET_COUNTER(v_intrans); + GET_COUNTER(v_tfree); + GET_COUNTER(v_forks); + GET_COUNTER(v_vforks); + GET_COUNTER(v_rforks); + GET_COUNTER(v_kthreads); + GET_COUNTER(v_forkpages); + GET_COUNTER(v_vforkpages); + GET_COUNTER(v_rforkpages); + GET_COUNTER(v_kthreadpages); +#undef GET_COUNTER } else { - size_t size = sizeof(unsigned int); + size_t size = sizeof(uint64_t); + #define GET_VM_STATS(cat, name) \ mysysctl("vm.stats." #cat "." #name, &vmmp->name, &size, NULL, 0) /* sys */ GET_VM_STATS(sys, v_swtch); GET_VM_STATS(sys, v_trap); GET_VM_STATS(sys, v_syscall); GET_VM_STATS(sys, v_intr); GET_VM_STATS(sys, v_soft); /* vm */ GET_VM_STATS(vm, v_vm_faults); GET_VM_STATS(vm, v_io_faults); GET_VM_STATS(vm, v_cow_faults); GET_VM_STATS(vm, v_cow_optim); GET_VM_STATS(vm, v_zfod); GET_VM_STATS(vm, v_ozfod); GET_VM_STATS(vm, v_swapin); GET_VM_STATS(vm, v_swapout); GET_VM_STATS(vm, v_swappgsin); GET_VM_STATS(vm, v_swappgsout); GET_VM_STATS(vm, v_vnodein); GET_VM_STATS(vm, v_vnodeout); GET_VM_STATS(vm, v_vnodepgsin); GET_VM_STATS(vm, v_vnodepgsout); GET_VM_STATS(vm, v_intrans); GET_VM_STATS(vm, v_reactivated); GET_VM_STATS(vm, v_pdwakeups); GET_VM_STATS(vm, v_pdpages); GET_VM_STATS(vm, v_pdshortfalls); GET_VM_STATS(vm, v_dfree); GET_VM_STATS(vm, v_pfree); GET_VM_STATS(vm, v_tfree); GET_VM_STATS(vm, v_page_size); GET_VM_STATS(vm, v_page_count); GET_VM_STATS(vm, v_free_reserved); GET_VM_STATS(vm, v_free_target); GET_VM_STATS(vm, v_free_min); GET_VM_STATS(vm, v_free_count); GET_VM_STATS(vm, v_wire_count); GET_VM_STATS(vm, v_active_count); GET_VM_STATS(vm, v_inactive_target); GET_VM_STATS(vm, v_inactive_count); GET_VM_STATS(vm, v_laundry_count); GET_VM_STATS(vm, v_pageout_free_min); GET_VM_STATS(vm, v_interrupt_free_min); /*GET_VM_STATS(vm, v_free_severe);*/ GET_VM_STATS(vm, v_forks); GET_VM_STATS(vm, v_vforks); GET_VM_STATS(vm, v_rforks); GET_VM_STATS(vm, v_kthreads); GET_VM_STATS(vm, v_forkpages); GET_VM_STATS(vm, v_vforkpages); GET_VM_STATS(vm, v_rforkpages); GET_VM_STATS(vm, v_kthreadpages); #undef GET_VM_STATS } } static void fill_vmtotal(struct vmtotal *vmtp) { if (kd != NULL) { /* XXX fill vmtp */ xo_errx(1, "not implemented"); } else { size_t size = sizeof(*vmtp); mysysctl("vm.vmtotal", vmtp, &size, NULL, 0); if (size != sizeof(*vmtp)) xo_errx(1, "vm.total size mismatch"); } } /* Determine how many cpu columns, and what index they are in kern.cp_times */ static int getcpuinfo(u_long *maskp, int *maxidp) { int maxcpu; int maxid; int ncpus; int i, j; int empty; size_t size; long *times; u_long mask; if (kd != NULL) xo_errx(1, "not implemented"); mask = 0; ncpus = 0; size = sizeof(maxcpu); mysysctl("kern.smp.maxcpus", &maxcpu, &size, NULL, 0); if (size != sizeof(maxcpu)) xo_errx(1, "sysctl kern.smp.maxcpus"); size = sizeof(long) * maxcpu * CPUSTATES; times = malloc(size); if (times == NULL) xo_err(1, "malloc %zd bytes", size); mysysctl("kern.cp_times", times, &size, NULL, 0); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { mask |= (1ul << i); ncpus++; } } if (maskp) *maskp = mask; if (maxidp) *maxidp = maxid; return (ncpus); } static void prthuman(const char *name, u_int64_t val, int size) { char buf[10]; int flags; char fmt[128]; snprintf(fmt, sizeof(fmt), "{:%s/%%*s}", name); if (size < 5 || size > 9) xo_errx(1, "doofus"); flags = HN_B | HN_NOSPACE | HN_DECIMAL; humanize_number(buf, size, val, "", HN_AUTOSCALE, flags); xo_attr("value", "%ju", (uintmax_t) val); xo_emit(fmt, size, buf); } static int hz, hdrcnt; static long *cur_cp_times; static long *last_cp_times; static size_t size_cp_times; static void dovmstat(unsigned int interval, int reps) { struct vmtotal total; time_t uptime, halfuptime; struct devinfo *tmp_dinfo; size_t size; int ncpus, maxid; u_long cpumask; int rate_adj; uptime = getuptime() / 1000000000LL; halfuptime = uptime / 2; rate_adj = 1; ncpus = 1; maxid = 0; /* * If the user stops the program (control-Z) and then resumes it, * print out the header again. */ (void)signal(SIGCONT, needhdr); /* * If our standard output is a tty, then install a SIGWINCH handler * and set wresized so that our first iteration through the main * vmstat loop will peek at the terminal's current rows to find out * how many lines can fit in a screenful of output. */ if (isatty(fileno(stdout)) != 0) { wresized = 1; (void)signal(SIGWINCH, needresize); } else { wresized = 0; winlines = VMSTAT_DEFAULT_LINES; } if (kd != NULL) { if (namelist[X_STATHZ].n_type != 0 && namelist[X_STATHZ].n_value != 0) kread(X_STATHZ, &hz, sizeof(hz)); if (!hz) kread(X_HZ, &hz, sizeof(hz)); } else { struct clockinfo clockrate; size = sizeof(clockrate); mysysctl("kern.clockrate", &clockrate, &size, NULL, 0); if (size != sizeof(clockrate)) xo_errx(1, "clockrate size mismatch"); hz = clockrate.hz; } if (Pflag) { ncpus = getcpuinfo(&cpumask, &maxid); size_cp_times = sizeof(long) * (maxid + 1) * CPUSTATES; cur_cp_times = calloc(1, size_cp_times); last_cp_times = calloc(1, size_cp_times); } for (hdrcnt = 1;;) { if (!--hdrcnt) printhdr(maxid, cpumask); if (kd != NULL) { if (kvm_getcptime(kd, cur.cp_time) < 0) xo_errx(1, "kvm_getcptime: %s", kvm_geterr(kd)); } else { size = sizeof(cur.cp_time); mysysctl("kern.cp_time", &cur.cp_time, &size, NULL, 0); if (size != sizeof(cur.cp_time)) xo_errx(1, "cp_time size mismatch"); } if (Pflag) { size = size_cp_times; mysysctl("kern.cp_times", cur_cp_times, &size, NULL, 0); if (size != size_cp_times) xo_errx(1, "cp_times mismatch"); } tmp_dinfo = last.dinfo; last.dinfo = cur.dinfo; cur.dinfo = tmp_dinfo; last.snap_time = cur.snap_time; /* * Here what we want to do is refresh our device stats. * getdevs() returns 1 when the device list has changed. * If the device list has changed, we want to go through * the selection process again, in case a device that we * were previously displaying has gone away. */ switch (devstat_getdevs(NULL, &cur)) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: { int retval; num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; retval = devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0); switch (retval) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: printhdr(maxid, cpumask); break; default: break; } } default: break; } fill_vmmeter(&sum); fill_vmtotal(&total); xo_open_container("processes"); xo_emit("{:runnable/%1d} {:waiting/%ld} " "{:swapped-out/%ld}", total.t_rq - 1, total.t_dw + total.t_pw, total.t_sw); xo_close_container("processes"); xo_open_container("memory"); #define vmstat_pgtok(a) ((a) * (sum.v_page_size >> 10)) #define rate(x) (((x) * rate_adj + halfuptime) / uptime) /* round */ if (hflag) { xo_emit(""); prthuman("available-memory", total.t_avm * (u_int64_t)sum.v_page_size, 5); xo_emit(" "); prthuman("free-memory", total.t_free * (u_int64_t)sum.v_page_size, 5); xo_emit(" "); } else { xo_emit(" "); xo_emit("{:available-memory/%7d}", vmstat_pgtok(total.t_avm)); xo_emit(" "); xo_emit("{:free-memory/%7d}", vmstat_pgtok(total.t_free)); } xo_emit("{:total-page-faults/%5lu} ", (unsigned long)rate(sum.v_vm_faults - osum.v_vm_faults)); xo_close_container("memory"); xo_open_container("paging-rates"); xo_emit("{:page-reactivated/%3lu} ", (unsigned long)rate(sum.v_reactivated - osum.v_reactivated)); xo_emit("{:paged-in/%3lu} ", (unsigned long)rate(sum.v_swapin + sum.v_vnodein - (osum.v_swapin + osum.v_vnodein))); xo_emit("{:paged-out/%3lu} ", (unsigned long)rate(sum.v_swapout + sum.v_vnodeout - (osum.v_swapout + osum.v_vnodeout))); xo_emit("{:freed/%5lu} ", (unsigned long)rate(sum.v_tfree - osum.v_tfree)); xo_emit("{:scanned/%4lu} ", (unsigned long)rate(sum.v_pdpages - osum.v_pdpages)); xo_close_container("paging-rates"); devstats(); xo_open_container("fault-rates"); xo_emit("{:interrupts/%4lu} {:system-calls/%5lu} " "{:context-switches/%5u}", (unsigned long)rate(sum.v_intr - osum.v_intr), (unsigned long)rate(sum.v_syscall - osum.v_syscall), (unsigned long)rate(sum.v_swtch - osum.v_swtch)); xo_close_container("fault-rates"); if (Pflag) pcpustats(ncpus, cpumask, maxid); else cpustats(); xo_emit("\n"); xo_flush(); if (reps >= 0 && --reps <= 0) break; osum = sum; uptime = interval; rate_adj = 1000; /* * We round upward to avoid losing low-frequency events * (i.e., >= 1 per interval but < 1 per millisecond). */ if (interval != 1) halfuptime = (uptime + 1) / 2; else halfuptime = 0; (void)usleep(interval * 1000); } } static void printhdr(int maxid, u_long cpumask) { int i, num_shown; num_shown = MIN(num_selected, maxshowdevs); if (hflag) { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } else { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } if (num_shown > 1) xo_emit(" {T:/disks %*s}", num_shown * 4 - 7, ""); else if (num_shown == 1) xo_emit(" {T:disks}"); xo_emit(" {T:faults} "); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:/cpu%d} ", i); } xo_emit("\n"); } else xo_emit(" {T:cpu}\n"); if (hflag) { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } else { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } for (i = 0; i < num_devices; i++) if ((dev_select[i].selected) && (dev_select[i].selected <= maxshowdevs)) xo_emit("{T:/%c%c%d} ", dev_select[i].device_name[0], dev_select[i].device_name[1], dev_select[i].unit_number); xo_emit(" {T:in} {T:sy} {T:cs}"); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:us} {T:sy} {T:id}"); } xo_emit("\n"); } else xo_emit(" {T:us} {T:sy} {T:id}\n"); if (wresized != 0) doresize(); hdrcnt = winlines; } /* * Force a header to be prepended to the next output. */ static void needhdr(int dummy __unused) { hdrcnt = 1; } /* * When the terminal is resized, force an update of the maximum number of rows * printed between each header repetition. Then force a new header to be * prepended to the next output. */ void needresize(int signo) { wresized = 1; hdrcnt = 1; } /* * Update the global `winlines' count of terminal rows. */ void doresize(void) { int status; struct winsize w; for (;;) { status = ioctl(fileno(stdout), TIOCGWINSZ, &w); if (status == -1 && errno == EINTR) continue; else if (status == -1) xo_err(1, "ioctl"); if (w.ws_row > 3) winlines = w.ws_row - 3; else winlines = VMSTAT_DEFAULT_LINES; break; } /* * Inhibit doresize() calls until we are rescheduled by SIGWINCH. */ wresized = 0; } #ifdef notyet static void dotimes(void) { unsigned int pgintime, rectime; kread(X_REC, &rectime, sizeof(rectime)); kread(X_PGIN, &pgintime, sizeof(pgintime)); kread(X_SUM, &sum, sizeof(sum)); xo_emit("{:page-reclaims/%u} {N:reclaims}, " "{:reclaim-time/%u} {N:total time (usec)}\n", sum.v_pgrec, rectime); xo_emit("{L:average}: {:reclaim-average/%u} {N:usec \\/ reclaim}\n", rectime / sum.v_pgrec); xo_emit("\n"); xo_emit("{:page-ins/%u} {N:page ins}, " "{:page-in-time/%u} {N:total time (msec)}\n", sum.v_pgin, pgintime / 10); xo_emit("{L:average}: {:average/%8.1f} {N:msec \\/ page in}\n", pgintime / (sum.v_pgin * 10.0)); } #endif static long pct(long top, long bot) { long ans; if (bot == 0) return(0); ans = (quad_t)top * 100 / bot; return (ans); } #define PCT(top, bot) pct((long)(top), (long)(bot)) static void dosum(void) { struct nchstats lnchstats; long nchtotal; fill_vmmeter(&sum); xo_open_container("summary-statistics"); xo_emit("{:context-switches/%9u} {N:cpu context switches}\n", sum.v_swtch); xo_emit("{:interrupts/%9u} {N:device interrupts}\n", sum.v_intr); xo_emit("{:software-interrupts/%9u} {N:software interrupts}\n", sum.v_soft); xo_emit("{:traps/%9u} {N:traps}\n", sum.v_trap); xo_emit("{:system-calls/%9u} {N:system calls}\n", sum.v_syscall); xo_emit("{:kernel-threads/%9u} {N:kernel threads created}\n", sum.v_kthreads); xo_emit("{:forks/%9u} {N: fork() calls}\n", sum.v_forks); xo_emit("{:vforks/%9u} {N:vfork() calls}\n", sum.v_vforks); xo_emit("{:rforks/%9u} {N:rfork() calls}\n", sum.v_rforks); xo_emit("{:swap-ins/%9u} {N:swap pager pageins}\n", sum.v_swapin); xo_emit("{:swap-in-pages/%9u} {N:swap pager pages paged in}\n", sum.v_swappgsin); xo_emit("{:swap-outs/%9u} {N:swap pager pageouts}\n", sum.v_swapout); xo_emit("{:swap-out-pages/%9u} {N:swap pager pages paged out}\n", sum.v_swappgsout); xo_emit("{:vnode-page-ins/%9u} {N:vnode pager pageins}\n", sum.v_vnodein); xo_emit("{:vnode-page-in-pages/%9u} {N:vnode pager pages paged in}\n", sum.v_vnodepgsin); xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pageouts}\n", sum.v_vnodeout); xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pages paged out}\n", sum.v_vnodepgsout); xo_emit("{:page-daemon-wakeups/%9u} {N:page daemon wakeups}\n", sum.v_pdwakeups); xo_emit("{:page-daemon-pages/%9u} {N:pages examined by the page daemon}\n", sum.v_pdpages); xo_emit("{:page-reclamation-shortfalls/%9u} {N:clean page reclamation shortfalls}\n", sum.v_pdshortfalls); xo_emit("{:reactivated/%9u} {N:pages reactivated by the page daemon}\n", sum.v_reactivated); xo_emit("{:copy-on-write-faults/%9u} {N:copy-on-write faults}\n", sum.v_cow_faults); xo_emit("{:copy-on-write-optimized-faults/%9u} {N:copy-on-write optimized faults}\n", sum.v_cow_optim); xo_emit("{:zero-fill-pages/%9u} {N:zero fill pages zeroed}\n", sum.v_zfod); xo_emit("{:zero-fill-prezeroed/%9u} {N:zero fill pages prezeroed}\n", sum.v_ozfod); xo_emit("{:intransit-blocking/%9u} {N:intransit blocking page faults}\n", sum.v_intrans); xo_emit("{:total-faults/%9u} {N:total VM faults taken}\n", sum.v_vm_faults); xo_emit("{:faults-requiring-io/%9u} {N:page faults requiring I\\/O}\n", sum.v_io_faults); xo_emit("{:faults-from-thread-creation/%9u} {N:pages affected by kernel thread creation}\n", sum.v_kthreadpages); xo_emit("{:faults-from-fork/%9u} {N:pages affected by fork}()\n", sum.v_forkpages); xo_emit("{:faults-from-vfork/%9u} {N:pages affected by vfork}()\n", sum.v_vforkpages); xo_emit("{:pages-rfork/%9u} {N:pages affected by rfork}()\n", sum.v_rforkpages); xo_emit("{:pages-freed/%9u} {N:pages freed}\n", sum.v_tfree); xo_emit("{:pages-freed-by-daemon/%9u} {N:pages freed by daemon}\n", sum.v_dfree); xo_emit("{:pages-freed-on-exit/%9u} {N:pages freed by exiting processes}\n", sum.v_pfree); xo_emit("{:active-pages/%9u} {N:pages active}\n", sum.v_active_count); xo_emit("{:inactive-pages/%9u} {N:pages inactive}\n", sum.v_inactive_count); xo_emit("{:laundry-pages/%9u} {N:pages in the laundry queue}\n", sum.v_laundry_count); xo_emit("{:wired-pages/%9u} {N:pages wired down}\n", sum.v_wire_count); xo_emit("{:free-pages/%9u} {N:pages free}\n", sum.v_free_count); xo_emit("{:bytes-per-page/%9u} {N:bytes per page}\n", sum.v_page_size); if (kd != NULL) { kread(X_NCHSTATS, &lnchstats, sizeof(lnchstats)); } else { size_t size = sizeof(lnchstats); mysysctl("vfs.cache.nchstats", &lnchstats, &size, NULL, 0); if (size != sizeof(lnchstats)) xo_errx(1, "vfs.cache.nchstats size mismatch"); } nchtotal = lnchstats.ncs_goodhits + lnchstats.ncs_neghits + lnchstats.ncs_badhits + lnchstats.ncs_falsehits + lnchstats.ncs_miss + lnchstats.ncs_long; xo_emit("{:total-name-lookups/%9ld} {N:total name lookups}\n", nchtotal); xo_emit("{P:/%9s} {N:cache hits} " "({:positive-cache-hits/%ld}% pos + " "{:negative-cache-hits/%ld}% {N:neg}) " "system {:cache-hit-percent/%ld}% per-directory\n", "", PCT(lnchstats.ncs_goodhits, nchtotal), PCT(lnchstats.ncs_neghits, nchtotal), PCT(lnchstats.ncs_pass2, nchtotal)); xo_emit("{P:/%9s} {L:deletions} {:deletions/%ld}%, " "{L:falsehits} {:false-hits/%ld}%, " "{L:toolong} {:too-long/%ld}%\n", "", PCT(lnchstats.ncs_badhits, nchtotal), PCT(lnchstats.ncs_falsehits, nchtotal), PCT(lnchstats.ncs_long, nchtotal)); xo_close_container("summary-statistics"); } static void doforkst(void) { fill_vmmeter(&sum); xo_open_container("fork-statistics"); xo_emit("{:fork/%u} {N:forks}, {:fork-pages/%u} {N:pages}, " "{L:average} {:fork-average/%.2f}\n", sum.v_forks, sum.v_forkpages, sum.v_forks == 0 ? 0.0 : (double)sum.v_forkpages / sum.v_forks); xo_emit("{:vfork/%u} {N:vforks}, {:vfork-pages/%u} {N:pages}, " "{L:average} {:vfork-average/%.2f}\n", sum.v_vforks, sum.v_vforkpages, sum.v_vforks == 0 ? 0.0 : (double)sum.v_vforkpages / sum.v_vforks); xo_emit("{:rfork/%u} {N:rforks}, {:rfork-pages/%u} {N:pages}, " "{L:average} {:rfork-average/%.2f}\n", sum.v_rforks, sum.v_rforkpages, sum.v_rforks == 0 ? 0.0 : (double)sum.v_rforkpages / sum.v_rforks); xo_close_container("fork-statistics"); } static void devstats(void) { int dn, state; long double transfers_per_second; long double busy_seconds; long tmp; for (state = 0; state < CPUSTATES; ++state) { tmp = cur.cp_time[state]; cur.cp_time[state] -= last.cp_time[state]; last.cp_time[state] = tmp; } busy_seconds = cur.snap_time - last.snap_time; xo_open_list("device"); for (dn = 0; dn < num_devices; dn++) { int di; if ((dev_select[dn].selected == 0) || (dev_select[dn].selected > maxshowdevs)) continue; di = dev_select[dn].position; if (devstat_compute_statistics(&cur.dinfo->devices[di], &last.dinfo->devices[di], busy_seconds, DSM_TRANSFERS_PER_SECOND, &transfers_per_second, DSM_NONE) != 0) xo_errx(1, "%s", devstat_errbuf); xo_open_instance("device"); xo_emit("{ekq:name/%c%c%d}{:transfers/%3.0Lf} ", dev_select[dn].device_name[0], dev_select[dn].device_name[1], dev_select[dn].unit_number, transfers_per_second); xo_close_instance("device"); } xo_close_list("device"); } static void percent(const char *name, double pct, int *over) { char buf[10]; char fmt[128]; int l; snprintf(fmt, sizeof(fmt), " {:%s/%%*s}", name); l = snprintf(buf, sizeof(buf), "%.0f", pct); if (l == 1 && *over) { xo_emit(fmt, 1, buf); (*over)--; } else xo_emit(fmt, 2, buf); if (l > 2) (*over)++; } static void cpustats(void) { int state, over; double lpct, total; total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur.cp_time[state]; if (total) lpct = 100.0 / total; else lpct = 0.0; over = 0; xo_open_container("cpu-statistics"); percent("user", (cur.cp_time[CP_USER] + cur.cp_time[CP_NICE]) * lpct, &over); percent("system", (cur.cp_time[CP_SYS] + cur.cp_time[CP_INTR]) * lpct, &over); percent("idle", cur.cp_time[CP_IDLE] * lpct, &over); xo_close_container("cpu-statistics"); } static void pcpustats(int ncpus, u_long cpumask, int maxid) { int state, i; double lpct, total; long tmp; int over; /* devstats does this for cp_time */ for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; for (state = 0; state < CPUSTATES; ++state) { tmp = cur_cp_times[i * CPUSTATES + state]; cur_cp_times[i * CPUSTATES + state] -= last_cp_times[i * CPUSTATES + state]; last_cp_times[i * CPUSTATES + state] = tmp; } } over = 0; xo_open_list("cpu"); for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; xo_open_instance("cpu"); xo_emit("{ke:name/%d}", i); total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur_cp_times[i * CPUSTATES + state]; if (total) lpct = 100.0 / total; else lpct = 0.0; percent("user", (cur_cp_times[i * CPUSTATES + CP_USER] + cur_cp_times[i * CPUSTATES + CP_NICE]) * lpct, &over); percent("system", (cur_cp_times[i * CPUSTATES + CP_SYS] + cur_cp_times[i * CPUSTATES + CP_INTR]) * lpct, &over); percent("idle", cur_cp_times[i * CPUSTATES + CP_IDLE] * lpct, &over); xo_close_instance("cpu"); } xo_close_list("cpu"); } static unsigned int read_intrcnts(unsigned long **intrcnts) { size_t intrcntlen; if (kd != NULL) { kread(X_SINTRCNT, &intrcntlen, sizeof(intrcntlen)); if ((*intrcnts = malloc(intrcntlen)) == NULL) err(1, "malloc()"); kread(X_INTRCNT, *intrcnts, intrcntlen); } else { for (*intrcnts = NULL, intrcntlen = 1024; ; intrcntlen *= 2) { *intrcnts = reallocf(*intrcnts, intrcntlen); if (*intrcnts == NULL) err(1, "reallocf()"); if (mysysctl("hw.intrcnt", *intrcnts, &intrcntlen, NULL, 0) == 0) break; } } return (intrcntlen / sizeof(unsigned long)); } static void print_intrcnts(unsigned long *intrcnts, unsigned long *old_intrcnts, char *intrnames, unsigned int nintr, size_t istrnamlen, long long period_ms) { unsigned long *intrcnt, *old_intrcnt; uint64_t inttotal, old_inttotal, total_count, total_rate; char* intrname; unsigned int i; inttotal = 0; old_inttotal = 0; intrname = intrnames; xo_open_list("interrupt"); for (i = 0, intrcnt=intrcnts, old_intrcnt=old_intrcnts; i < nintr; i++) { if (intrname[0] != '\0' && (*intrcnt != 0 || aflag)) { unsigned long count, rate; count = *intrcnt - *old_intrcnt; rate = (count * 1000 + period_ms / 2) / period_ms; xo_open_instance("interrupt"); xo_emit("{d:name/%-*s}{ket:name/%s} " "{:total/%20lu} {:rate/%10lu}\n", (int)istrnamlen, intrname, intrname, count, rate); xo_close_instance("interrupt"); } intrname += strlen(intrname) + 1; inttotal += *intrcnt++; old_inttotal += *old_intrcnt++; } total_count = inttotal - old_inttotal; total_rate = (total_count * 1000 + period_ms / 2) / period_ms; xo_close_list("interrupt"); xo_emit("{L:/%-*s} {:total-interrupts/%20" PRIu64 "} " "{:total-rate/%10" PRIu64 "}\n", (int)istrnamlen, "Total", total_count, total_rate); } static void dointr(unsigned int interval, int reps) { unsigned long *intrcnts; long long uptime, period_ms; unsigned long *old_intrcnts = NULL; size_t clen, inamlen, istrnamlen; char *intrnames, *intrname; uptime = getuptime(); /* Get the names of each interrupt source */ if (kd != NULL) { kread(X_SINTRNAMES, &inamlen, sizeof(inamlen)); if ((intrnames = malloc(inamlen)) == NULL) xo_err(1, "malloc()"); kread(X_INTRNAMES, intrnames, inamlen); } else { for (intrnames = NULL, inamlen = 1024; ; inamlen *= 2) { if ((intrnames = reallocf(intrnames, inamlen)) == NULL) xo_err(1, "reallocf()"); if (mysysctl("hw.intrnames", intrnames, &inamlen, NULL, 0) == 0) break; } } /* Determine the length of the longest interrupt name */ intrname = intrnames; istrnamlen = strlen("interrupt"); while(*intrname != '\0') { clen = strlen(intrname); if (clen > istrnamlen) istrnamlen = clen; intrname += strlen(intrname) + 1; } xo_emit("{T:/%-*s} {T:/%20s} {T:/%10s}\n", (int)istrnamlen, "interrupt", "total", "rate"); /* * Loop reps times printing differential interrupt counts. If reps is * zero, then run just once, printing total counts */ xo_open_container("interrupt-statistics"); period_ms = uptime / 1000000; while(1) { unsigned int nintr; long long old_uptime; nintr = read_intrcnts(&intrcnts); /* * Initialize old_intrcnts to 0 for the first pass, so * print_intrcnts will print total interrupts since boot */ if (old_intrcnts == NULL) { old_intrcnts = calloc(nintr, sizeof(unsigned long)); if (old_intrcnts == NULL) xo_err(1, "calloc()"); } print_intrcnts(intrcnts, old_intrcnts, intrnames, nintr, istrnamlen, period_ms); xo_flush(); free(old_intrcnts); old_intrcnts = intrcnts; if (reps >= 0 && --reps <= 0) break; usleep(interval * 1000); old_uptime = uptime; uptime = getuptime(); period_ms = (uptime - old_uptime) / 1000000; } xo_close_container("interrupt-statistics"); } static void domemstat_malloc(void) { struct memory_type_list *mtlp; struct memory_type *mtp; int error, first, i; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_malloc(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_malloc: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_malloc(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_malloc: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_malloc: %s", memstat_strerror(error)); } } xo_open_container("malloc-statistics"); xo_emit("{T:/%13s} {T:/%5s} {T:/%6s} {T:/%7s} {T:/%8s} {T:Size(s)}\n", "Type", "InUse", "MemUse", "HighUse", "Requests"); xo_open_list("memory"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { if (memstat_get_numallocs(mtp) == 0 && memstat_get_count(mtp) == 0) continue; xo_open_instance("memory"); xo_emit("{k:type/%13s/%s} {:in-use/%5" PRIu64 "} " "{:memory-use/%5" PRIu64 "}{U:K} {:high-use/%7s} " "{:requests/%8" PRIu64 "} ", memstat_get_name(mtp), memstat_get_count(mtp), (memstat_get_bytes(mtp) + 1023) / 1024, "-", memstat_get_numallocs(mtp)); first = 1; xo_open_list("size"); for (i = 0; i < 32; i++) { if (memstat_get_sizemask(mtp) & (1 << i)) { if (!first) xo_emit(","); xo_emit("{l:size/%d}", 1 << (i + 4)); first = 0; } } xo_close_list("size"); xo_close_instance("memory"); xo_emit("\n"); } xo_close_list("memory"); xo_close_container("malloc-statistics"); memstat_mtl_free(mtlp); } static void domemstat_zone(void) { struct memory_type_list *mtlp; struct memory_type *mtp; char name[MEMTYPE_MAXNAME + 1]; int error; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_uma(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_uma: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_uma(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_uma: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_uma: %s", memstat_strerror(error)); } } xo_open_container("memory-zone-statistics"); xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} " "{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE", "LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP"); xo_open_list("zone"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { strlcpy(name, memstat_get_name(mtp), MEMTYPE_MAXNAME); strcat(name, ":"); xo_open_instance("zone"); xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6" PRIu64 "}, " "{:limit/%6" PRIu64 "},{:used/%8" PRIu64 "}," "{:free/%8" PRIu64 "},{:requests/%8" PRIu64 "}," "{:fail/%4" PRIu64 "},{:sleep/%4" PRIu64 "}\n", name, memstat_get_name(mtp), memstat_get_size(mtp), memstat_get_countlimit(mtp), memstat_get_count(mtp), memstat_get_free(mtp), memstat_get_numallocs(mtp), memstat_get_failures(mtp), memstat_get_sleeps(mtp)); xo_close_instance("zone"); } memstat_mtl_free(mtlp); xo_close_list("zone"); xo_close_container("memory-zone-statistics"); xo_emit("\n"); } static void display_object(struct kinfo_vmobject *kvo) { const char *str; xo_open_instance("object"); xo_emit("{:resident/%5jd} ", (uintmax_t)kvo->kvo_resident); xo_emit("{:active/%5jd} ", (uintmax_t)kvo->kvo_active); xo_emit("{:inactive/%5jd} ", (uintmax_t)kvo->kvo_inactive); xo_emit("{:refcount/%3d} ", kvo->kvo_ref_count); xo_emit("{:shadowcount/%3d} ", kvo->kvo_shadow_count); switch (kvo->kvo_memattr) { #ifdef VM_MEMATTR_UNCACHEABLE case VM_MEMATTR_UNCACHEABLE: str = "UC"; break; #endif #ifdef VM_MEMATTR_WRITE_COMBINING case VM_MEMATTR_WRITE_COMBINING: str = "WC"; break; #endif #ifdef VM_MEMATTR_WRITE_THROUGH case VM_MEMATTR_WRITE_THROUGH: str = "WT"; break; #endif #ifdef VM_MEMATTR_WRITE_PROTECTED case VM_MEMATTR_WRITE_PROTECTED: str = "WP"; break; #endif #ifdef VM_MEMATTR_WRITE_BACK case VM_MEMATTR_WRITE_BACK: str = "WB"; break; #endif #ifdef VM_MEMATTR_WEAK_UNCACHEABLE case VM_MEMATTR_WEAK_UNCACHEABLE: str = "UC-"; break; #endif #ifdef VM_MEMATTR_WB_WA case VM_MEMATTR_WB_WA: str = "WB"; break; #endif #ifdef VM_MEMATTR_NOCACHE case VM_MEMATTR_NOCACHE: str = "NC"; break; #endif #ifdef VM_MEMATTR_DEVICE case VM_MEMATTR_DEVICE: str = "DEV"; break; #endif #ifdef VM_MEMATTR_CACHEABLE case VM_MEMATTR_CACHEABLE: str = "C"; break; #endif #ifdef VM_MEMATTR_PREFETCHABLE case VM_MEMATTR_PREFETCHABLE: str = "PRE"; break; #endif default: str = "??"; break; } xo_emit("{:attribute/%-3s} ", str); switch (kvo->kvo_type) { case KVME_TYPE_NONE: str = "--"; break; case KVME_TYPE_DEFAULT: str = "df"; break; case KVME_TYPE_VNODE: str = "vn"; break; case KVME_TYPE_SWAP: str = "sw"; break; case KVME_TYPE_DEVICE: str = "dv"; break; case KVME_TYPE_PHYS: str = "ph"; break; case KVME_TYPE_DEAD: str = "dd"; break; case KVME_TYPE_SG: str = "sg"; break; case KVME_TYPE_UNKNOWN: default: str = "??"; break; } xo_emit("{:type/%-2s} ", str); xo_emit("{:path/%-s}\n", kvo->kvo_path); xo_close_instance("object"); } static void doobjstat(void) { struct kinfo_vmobject *kvo; int cnt, i; kvo = kinfo_getvmobject(&cnt); if (kvo == NULL) { xo_warn("Failed to fetch VM object list"); return; } xo_emit("{T:RES/%5s} {T:ACT/%5s} {T:INACT/%5s} {T:REF/%3s} {T:SHD/%3s} " "{T:CM/%3s} {T:TP/%2s} {T:PATH/%s}\n"); xo_open_list("object"); for (i = 0; i < cnt; i++) display_object(&kvo[i]); free(kvo); xo_close_list("object"); } /* * kread reads something from the kernel, given its nlist index. */ static void kreado(int nlx, void *addr, size_t size, size_t offset) { const char *sym; if (namelist[nlx].n_type == 0 || namelist[nlx].n_value == 0) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "symbol %s not defined", sym); } if ((size_t)kvm_read(kd, namelist[nlx].n_value + offset, addr, size) != size) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "%s: %s", sym, kvm_geterr(kd)); } } static void kread(int nlx, void *addr, size_t size) { kreado(nlx, addr, size, 0); } static char * kgetstr(const char *strp) { int n = 0, size = 1; char *ret = NULL; do { if (size == n + 1) { ret = realloc(ret, size); if (ret == NULL) xo_err(1, "%s: realloc", __func__); size *= 2; } if (kvm_read(kd, (u_long)strp + n, &ret[n], 1) != 1) xo_errx(1, "%s: %s", __func__, kvm_geterr(kd)); } while (ret[n++] != '\0'); return (ret); } static void usage(void) { xo_error("%s%s", "usage: vmstat [-afHhimoPsz] [-M core [-N system]] [-c count] [-n devs]\n", " [-p type,if,pass] [-w wait] [disks] [wait [count]]\n"); xo_finish(); exit(1); }