Index: head/sys/kern/kern_shutdown.c =================================================================== --- head/sys/kern/kern_shutdown.c (revision 285992) +++ head/sys/kern/kern_shutdown.c (revision 285993) @@ -1,888 +1,770 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; #else static int trace_on_panic = 0; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, ""); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { mtx_lock(&Giant); kern_reboot(uap->opt); mtx_unlock(&Giant); } return (error); } /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL) { /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } else { /* No init(8) running, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (dumper.dumper == NULL) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(&dumper); } #endif if (coredump) error = dumpsys(&dumper); dumping--; return (error); } -static int -isbufbusy(struct buf *bp) -{ - if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 && - BUF_ISLOCKED(bp)) || - ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) - return (1); - return (0); -} - /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { - static int first_buf_printf = 1; - static int waittime = -1; + static int once = 0; #if defined(SMP) /* * Bind us to CPU 0 so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ - if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { - register struct buf *bp; - int iter, nbusy, pbusy; -#ifndef PREEMPTION - int subiter; -#endif - - waittime = 0; - - wdog_kern_pat(WD_LASTVAL); - sys_sync(curthread, NULL); - - /* - * With soft updates, some buffers that are - * written will be remarked as dirty until other - * buffers are written. - */ - for (iter = pbusy = 0; iter < 20; iter++) { - nbusy = 0; - for (bp = &buf[nbuf]; --bp >= buf; ) - if (isbufbusy(bp)) - nbusy++; - if (nbusy == 0) { - if (first_buf_printf) - printf("All buffers synced."); - break; - } - if (first_buf_printf) { - printf("Syncing disks, buffers remaining... "); - first_buf_printf = 0; - } - printf("%d ", nbusy); - if (nbusy < pbusy) - iter = 0; - pbusy = nbusy; - - wdog_kern_pat(WD_LASTVAL); - sys_sync(curthread, NULL); - -#ifdef PREEMPTION - /* - * Drop Giant and spin for a while to allow - * interrupt threads to run. - */ - DROP_GIANT(); - DELAY(50000 * iter); - PICKUP_GIANT(); -#else - /* - * Drop Giant and context switch several times to - * allow interrupt threads to run. - */ - DROP_GIANT(); - for (subiter = 0; subiter < 50 * iter; subiter++) { - thread_lock(curthread); - mi_switch(SW_VOL, NULL); - thread_unlock(curthread); - DELAY(1000); - } - PICKUP_GIANT(); -#endif - } - printf("\n"); - /* - * Count only busy local buffers to prevent forcing - * a fsck if we're just a client of a wedged NFS server - */ - nbusy = 0; - for (bp = &buf[nbuf]; --bp >= buf; ) { - if (isbufbusy(bp)) { -#if 0 -/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ - if (bp->b_dev == NULL) { - TAILQ_REMOVE(&mountlist, - bp->b_vp->v_mount, mnt_list); - continue; - } -#endif - nbusy++; - if (show_busybufs > 0) { - printf( - "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", - nbusy, bp, bp->b_vp, bp->b_flags, - (intmax_t)bp->b_blkno, - (intmax_t)bp->b_lblkno); - BUF_LOCKPRINTINFO(bp); - if (show_busybufs > 1) - vn_printf(bp->b_vp, - "vnode content: "); - } - } - } - if (nbusy) { - /* - * Failed to sync all blocks. Indicate this and don't - * unmount filesystems (thus forcing an fsck on reboot). - */ - printf("Giving up on %d buffers\n", nbusy); - DELAY(5000000); /* 5 seconds */ - } else { - if (!first_buf_printf) - printf("Final sync complete\n"); - /* - * Unmount filesystems - */ - if (panicstr == 0) - vfs_unmountall(); - } - swapoff_all(); - DELAY(100000); /* wait for console output to finish */ + if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { + once = 1; + bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: howto &= ~RB_HALT; break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANTS) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (1) or just a warning (0)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RWTUN, &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RWTUN, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; #endif bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif #ifdef KDB if (newpanic && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if (!(howto & RB_POWEROFF) || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop...", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop...", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)]; SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD, dumpdevname, 0, "Device for kernel dumps"); /* Registration of dumpers */ int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td) { size_t wantcopy; int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); if (di == NULL) { bzero(&dumper, sizeof dumper); dumpdevname[0] = '\0'; return (0); } if (dumper.dumper != NULL) return (EBUSY); dumper = *di; wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname)); if (wantcopy >= sizeof(dumpdevname)) { printf("set_dumper: device name truncated from '%s' -> '%s'\n", devname, dumpdevname); } return (0); } /* Call dumper with bounds checking. */ int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } return (di->dumper(di->priv, virtual, physical, offset, length)); } void mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver, uint64_t dumplen, uint32_t blksz) { bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); strlcpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); kdh->parity = kerneldump_parity(kdh); } Index: head/sys/kern/subr_param.c =================================================================== --- head/sys/kern/subr_param.c (revision 285992) +++ head/sys/kern/subr_param.c (revision 285993) @@ -1,300 +1,293 @@ /*- * Copyright (c) 1980, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.c 8.3 (Berkeley) 8/20/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_msgbuf.h" #include "opt_maxusers.h" #include #include #include #include #include #include #include #include #include #include /* * System parameter formulae. */ #ifndef HZ # if defined(__mips__) || defined(__arm__) # define HZ 100 # else # define HZ 1000 # endif # ifndef HZ_VM # define HZ_VM 100 # endif #else # ifndef HZ_VM # define HZ_VM HZ # endif #endif #define NPROC (20 + 16 * maxusers) #ifndef NBUF #define NBUF 0 #endif #ifndef MAXFILES #define MAXFILES (maxproc * 2) #endif static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS); int hz; /* system clock's frequency */ int tick; /* usec per tick (1000000 / hz) */ struct bintime tick_bt; /* bintime per tick (1s / hz) */ sbintime_t tick_sbt; int maxusers; /* base tunable */ int maxproc; /* maximum # of processes */ int maxprocperuid; /* max # of procs per user */ int maxfiles; /* sys. wide open files limit */ int maxfilesperproc; /* per-proc open files limit */ int msgbufsize; /* size of kernel message buffer */ int nbuf; int bio_transient_maxcnt; int ngroups_max; /* max # groups per process */ int nswbuf; pid_t pid_max = PID_MAX; long maxswzone; /* max swmeta KVA storage */ long maxbcache; /* max buffer cache KVA storage */ long maxpipekva; /* Limit on pipe KVA */ int vm_guest = VM_GUEST_NO; /* Running as virtual machine guest? */ u_long maxtsiz; /* max text size */ u_long dfldsiz; /* initial data size limit */ u_long maxdsiz; /* max data size */ u_long dflssiz; /* initial stack size limit */ u_long maxssiz; /* max stack size */ u_long sgrowsiz; /* amount to grow stack */ SYSCTL_INT(_kern, OID_AUTO, hz, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &hz, 0, "Number of clock ticks per second"); SYSCTL_INT(_kern, OID_AUTO, nbuf, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nbuf, 0, "Number of buffers in the buffer cache"); SYSCTL_INT(_kern, OID_AUTO, nswbuf, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nswbuf, 0, "Number of swap buffers"); SYSCTL_INT(_kern, OID_AUTO, msgbufsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &msgbufsize, 0, "Size of the kernel message buffer"); SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxswzone, 0, "Maximum memory for swap metadata"); SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxbcache, 0, "Maximum value of vfs.maxbufspace"); SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bio_transient_maxcnt, 0, "Maximum number of transient BIOs mappings"); SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxtsiz, 0, "Maximum text size"); SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &dfldsiz, 0, "Initial data size limit"); SYSCTL_ULONG(_kern, OID_AUTO, maxdsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxdsiz, 0, "Maximum data size"); SYSCTL_ULONG(_kern, OID_AUTO, dflssiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &dflssiz, 0, "Initial stack size limit"); SYSCTL_ULONG(_kern, OID_AUTO, maxssiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxssiz, 0, "Maximum stack size"); SYSCTL_ULONG(_kern, OID_AUTO, sgrowsiz, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &sgrowsiz, 0, "Amount to grow stack on a stack fault"); SYSCTL_PROC(_kern, OID_AUTO, vm_guest, CTLFLAG_RD | CTLTYPE_STRING, NULL, 0, sysctl_kern_vm_guest, "A", "Virtual machine guest detected?"); /* - * These have to be allocated somewhere; allocating - * them here forces loader errors if this file is omitted - * (if they've been externed everywhere else; hah!). - */ -struct buf *swbuf; - -/* * The elements of this array are ordered based upon the values of the * corresponding enum VM_GUEST members. */ static const char *const vm_guest_sysctl_names[] = { "none", "generic", "xen", "hv", "vmware", NULL }; CTASSERT(nitems(vm_guest_sysctl_names) - 1 == VM_LAST); /* * Boot time overrides that are not scaled against main memory */ void init_param1(void) { hz = -1; TUNABLE_INT_FETCH("kern.hz", &hz); if (hz == -1) hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ; tick = 1000000 / hz; tick_sbt = SBT_1S / hz; tick_bt = sbttobt(tick_sbt); #ifdef VM_SWZONE_SIZE_MAX maxswzone = VM_SWZONE_SIZE_MAX; #endif TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone); #ifdef VM_BCACHE_SIZE_MAX maxbcache = VM_BCACHE_SIZE_MAX; #endif TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache); msgbufsize = MSGBUF_SIZE; TUNABLE_INT_FETCH("kern.msgbufsize", &msgbufsize); maxtsiz = MAXTSIZ; TUNABLE_ULONG_FETCH("kern.maxtsiz", &maxtsiz); dfldsiz = DFLDSIZ; TUNABLE_ULONG_FETCH("kern.dfldsiz", &dfldsiz); maxdsiz = MAXDSIZ; TUNABLE_ULONG_FETCH("kern.maxdsiz", &maxdsiz); dflssiz = DFLSSIZ; TUNABLE_ULONG_FETCH("kern.dflssiz", &dflssiz); maxssiz = MAXSSIZ; TUNABLE_ULONG_FETCH("kern.maxssiz", &maxssiz); sgrowsiz = SGROWSIZ; TUNABLE_ULONG_FETCH("kern.sgrowsiz", &sgrowsiz); /* * Let the administrator set {NGROUPS_MAX}, but disallow values * less than NGROUPS_MAX which would violate POSIX.1-2008 or * greater than INT_MAX-1 which would result in overflow. */ ngroups_max = NGROUPS_MAX; TUNABLE_INT_FETCH("kern.ngroups", &ngroups_max); if (ngroups_max < NGROUPS_MAX) ngroups_max = NGROUPS_MAX; /* * Only allow to lower the maximal pid. * Prevent setting up a non-bootable system if pid_max is too low. */ TUNABLE_INT_FETCH("kern.pid_max", &pid_max); if (pid_max > PID_MAX) pid_max = PID_MAX; else if (pid_max < 300) pid_max = 300; TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed); } /* * Boot time overrides that are scaled against main memory */ void init_param2(long physpages) { /* Base parameters */ maxusers = MAXUSERS; TUNABLE_INT_FETCH("kern.maxusers", &maxusers); if (maxusers == 0) { maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE); if (maxusers < 32) maxusers = 32; #ifdef VM_MAX_AUTOTUNE_MAXUSERS if (maxusers > VM_MAX_AUTOTUNE_MAXUSERS) maxusers = VM_MAX_AUTOTUNE_MAXUSERS; #endif /* * Scales down the function in which maxusers grows once * we hit 384. */ if (maxusers > 384) maxusers = 384 + ((maxusers - 384) / 8); } /* * The following can be overridden after boot via sysctl. Note: * unless overriden, these macros are ultimately based on maxusers. * Limit maxproc so that kmap entries cannot be exhausted by * processes. */ maxproc = NPROC; TUNABLE_INT_FETCH("kern.maxproc", &maxproc); if (maxproc > (physpages / 12)) maxproc = physpages / 12; maxprocperuid = (maxproc * 9) / 10; /* * The default limit for maxfiles is 1/12 of the number of * physical page but not less than 16 times maxusers. * At most it can be 1/6 the number of physical pages. */ maxfiles = imax(MAXFILES, physpages / 8); TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles); if (maxfiles > (physpages / 4)) maxfiles = physpages / 4; maxfilesperproc = (maxfiles / 10) * 9; /* * Cannot be changed after boot. */ nbuf = NBUF; TUNABLE_INT_FETCH("kern.nbuf", &nbuf); TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt); /* * The default for maxpipekva is min(1/64 of the kernel address space, * max(1/64 of main memory, 512KB)). See sys_pipe.c for more details. */ maxpipekva = (physpages / 64) * PAGE_SIZE; TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva); if (maxpipekva < 512 * 1024) maxpipekva = 512 * 1024; if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64) maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64; } /* * Sysctl stringifying handler for kern.vm_guest. */ static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT_STR(req, vm_guest_sysctl_names[vm_guest])); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 285992) +++ head/sys/kern/vfs_bio.c (revision 285993) @@ -1,4584 +1,4712 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include #include #include #include +#include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; -/* - * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has - * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. - */ -struct buf *buf; /* buffer header pool */ +static struct buf *buf; /* buffer header pool */ +extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); /* * Lock for the non-dirty bufqueues */ static struct mtx_padalign bqclean; /* * Lock for the dirty queue. */ static struct mtx_padalign bqdirty; /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 4 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_EMPTY 3 /* empty buffer headers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { if (qindex == QUEUE_DIRTY) return (struct mtx *)(&bqdirty); return (struct mtx *)(&bqclean); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { int need_wakeup, on; /* * If someone is waiting for bufspace, wake them up. Even * though we may not have freed the kva space yet, the waiting * process will be able to now. */ rw_rlock(&nblock); for (;;) { need_wakeup = 0; on = needsbuffer; if ((on & VFS_BIO_NEED_BUFSPACE) == 0) break; need_wakeup = 1; if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~VFS_BIO_NEED_BUFSPACE)) break; } if (need_wakeup) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } /* * bufspaceadjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspaceadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspaceadjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspacewakeup(); } else atomic_add_long(&bufspace, diff); bp->b_bufsize = bufsize; } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufmallocspace, -diff); bufspacewakeup(); } else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * bufcountadd: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountadd(struct buf *bp) { int mask, need_wakeup, old, on; KASSERT((bp->b_flags & B_INFREECNT) == 0, ("buf %p already counted as free", bp)); bp->b_flags |= B_INFREECNT; old = atomic_fetchadd_int(&numfreebuffers, 1); KASSERT(old >= 0 && old < nbuf, ("numfreebuffers climbed to %d", old + 1)); mask = VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) mask |= VFS_BIO_NEED_FREE; rw_rlock(&nblock); for (;;) { need_wakeup = 0; on = needsbuffer; if (on == 0) break; need_wakeup = 1; if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask)) break; } if (need_wakeup) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } /* * bufcountsub: * * Decrement the numfreebuffers count as needed. */ static void bufcountsub(struct buf *bp) { int old; /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, the buffer was free and we must decrement * numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { KASSERT((bp->b_flags & B_INFREECNT) != 0, ("buf %p not counted in numfreebuffers", bp)); bp->b_flags &= ~B_INFREECNT; old = atomic_fetchadd_int(&numfreebuffers, -1); KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); } } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artifically limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; CTASSERT(MAXBCACHEBUF >= MAXBSIZE); mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL | B_INFREECNT; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); lobufspace = hibufspace - MAXBCACHEBUF; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data > unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); +} +static int +isbufbusy(struct buf *bp) +{ + if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 && + BUF_ISLOCKED(bp)) || + ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) + return (1); + return (0); +} + +/* + * Shutdown the system cleanly to prepare for reboot, halt, or power off. + */ +void +bufshutdown(int show_busybufs) +{ + static int first_buf_printf = 1; + struct buf *bp; + int iter, nbusy, pbusy; +#ifndef PREEMPTION + int subiter; +#endif + + /* + * Sync filesystems for shutdown + */ + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = pbusy = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) + if (isbufbusy(bp)) + nbusy++; + if (nbusy == 0) { + if (first_buf_printf) + printf("All buffers synced."); + break; + } + if (first_buf_printf) { + printf("Syncing disks, buffers remaining... "); + first_buf_printf = 0; + } + printf("%d ", nbusy); + if (nbusy < pbusy) + iter = 0; + pbusy = nbusy; + + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + +#ifdef PREEMPTION + /* + * Drop Giant and spin for a while to allow + * interrupt threads to run. + */ + DROP_GIANT(); + DELAY(50000 * iter); + PICKUP_GIANT(); +#else + /* + * Drop Giant and context switch several times to + * allow interrupt threads to run. + */ + DROP_GIANT(); + for (subiter = 0; subiter < 50 * iter; subiter++) { + thread_lock(curthread); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); + DELAY(1000); + } + PICKUP_GIANT(); +#endif + } + printf("\n"); + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (isbufbusy(bp)) { +#if 0 +/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ + if (bp->b_dev == NULL) { + TAILQ_REMOVE(&mountlist, + bp->b_vp->v_mount, mnt_list); + continue; + } +#endif + nbusy++; + if (show_busybufs > 0) { + printf( + "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", + nbusy, bp, bp->b_vp, bp->b_flags, + (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno); + BUF_LOCKPRINTINFO(bp); + if (show_busybufs > 1) + vn_printf(bp->b_vp, + "vnode content: "); + } + } + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("Giving up on %d buffers\n", nbusy); + DELAY(5000000); /* 5 seconds */ + } else { + if (!first_buf_printf) + printf("Final sync complete\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + swapoff_all(); + DELAY(100000); /* wait for console output to finish */ } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; BUF_ASSERT_XLOCKED(bp); nlock = bqlock(qindex); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountadd(bp); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; bufcountsub(bp); } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer * from the queue and return the stats are already done. */ if (bp->b_flags & B_REMFREE) { bp->b_flags &= ~B_REMFREE; return; } bufcountsub(bp); } /* * bufkvafree: * * Free the kva allocation for a buffer. * */ static void bufkvafree(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkvaalloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkvaalloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkvafree(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ atomic_add_int(&bufdefragcnt, 1); return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); if (bp->b_pin_count > 0) bunpin_wait(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && bp->b_error == EIO && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If the error is anything * other than an I/O error (EIO), assume that retrying * is futile. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_bufobj->bo_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; VM_OBJECT_RLOCK(obj); for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } VM_OBJECT_RUNLOCK(obj); if ((bp->b_flags & B_INVAL) == 0 && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); VM_OBJECT_WLOCK(obj); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh"); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); VM_OBJECT_WUNLOCK(obj); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { if (bp->b_bufsize != 0) allocbuf(bp, 0); if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); bufkvafree(bp); qindex = QUEUE_EMPTY; bp->b_flags |= B_AGE; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: /* unlock */ BUF_UNLOCK(bp); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { vm_object_t obj; vm_page_t m; int i; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_lock(m); vm_page_unwire(m, PQ_INACTIVE); /* * Might as well free the page if we can and it has * no valid data. We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) { if (m->wire_count == 0 && !vm_page_busied(m)) vm_page_free(m); } else if (bp->b_flags & B_DIRECT) vm_page_try_to_free(m); vm_page_unlock(m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); if (bp->b_bufsize) bufspaceadjust(bp, 0); bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * Ask the bufdaemon for help, or act as bufdaemon itself, when a * locked vnode is supplied. */ static void getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, int defrag) { struct thread *td; char *waitmsg; int error, fl, flags, norunbuf; mtx_assert(&bqclean, MA_OWNED); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } atomic_set_int(&needsbuffer, flags); mtx_unlock(&bqclean); bd_speedup(); /* heeeelp */ if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while ((needsbuffer & flags) != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if ((needsbuffer & flags) == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } static void getnewbuf_reuse_bp(struct buf *bp, int qindex) { CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); mtx_assert(&bqclean, MA_NOTOWNED); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp != NULL) brelvp(bp); } /* * Get the rest of the buffer freed up. b_kva* is still valid * after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", bp, bp->b_vp, qindex)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; KASSERT((bp->b_flags & B_INFREECNT) == 0, ("buf %p still counted as free?", bp)); bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_pin_count = 0; bp->b_data = bp->b_kvabase; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); } static struct buf * getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) { struct buf *bp, *nbp; int nqindex, qindex, pass; KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); pass = 0; restart: if (pass != 0) atomic_add_int(&getnewbufrestarts, 1); nbp = NULL; mtx_lock(&bqclean); /* * If we're not defragging or low on bufspace attempt to make a new * buf from a header. */ if (defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[nqindex]); } /* * All available buffers might be clean or we need to start recycling. */ if (nbp == NULL) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { qindex = nqindex; /* * Calculate next bp (we can only use it if we do not * release the bqlock) */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch (qindex) { case QUEUE_EMPTY: nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[nqindex]); if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_CLEAN: if (metadata && pass == 0) { pass = 1; nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[nqindex]); } /* * nbp is NULL. */ break; } } /* * If we are defragging then we need a buffer with * b_kvasize != 0. This situation occurs when we * have many unmapped bufs. */ if (defrag && bp->b_kvasize == 0) continue; /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if (bp->b_vflags & BV_BKGRDINPROG) { BUF_UNLOCK(bp); continue; } /* * Requeue the background write buffer with error. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bremfreel(bp); mtx_unlock(&bqclean); bqrelse(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); bremfreel(bp); mtx_unlock(&bqclean); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ getnewbuf_reuse_bp(bp, qindex); mtx_assert(&bqclean, MA_NOTOWNED); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; brelse(bp); defrag = 0; goto restart; } /* * Notify any waiters for the buffer lock about * identity change by freeing the buffer. */ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) { bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (metadata) break; /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; brelse(bp); goto restart; } break; } return (bp); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, int gbflags) { struct buf *bp; int defrag, metadata; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); defrag = 0; if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = 1; else metadata = 0; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); restart: bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED, metadata); if (bp != NULL) defrag = 0; /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { mtx_assert(&bqclean, MA_OWNED); getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); mtx_assert(&bqclean, MA_NOTOWNED); } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { mtx_assert(&bqclean, MA_NOTOWNED); bufkvafree(bp); atomic_add_int(&bufreusecnt, 1); } else { mtx_assert(&bqclean, MA_NOTOWNED); /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order to * keep fragmentation sane we only allocate kva in BKVASIZE * chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkvaalloc(bp, maxsize, gbflags)) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED | GB_KVAALLOC)) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } atomic_add_int(&bufreusecnt, 1); } return (bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqdirty); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqdirty); while (flushed != target) { maybe_yield(); mtx_lock(&bqdirty); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqdirty); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqdirty); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqdirty); if (error != 0) continue; if (bp->b_pin_count > 0) { BUF_UNLOCK(bp); continue; } /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqdirty); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqdirty); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { struct buf *scratch_bp; int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); mapping_loop: if (bufkvaalloc(bp, maxsize, gbflags)) { /* * Request defragmentation. getnewbuf() returns us the * allocated space by the scratch buffer KVA. */ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | (GB_UNMAPPED | GB_KVAALLOC)); if (scratch_bp == NULL) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); goto mapping_loop; } KASSERT(scratch_bp->b_kvabase != unmapped_buf, ("scratch bp has no KVA %p", scratch_bp)); /* Grab pointers. */ bp->b_kvabase = scratch_bp->b_kvabase; bp->b_kvasize = scratch_bp->b_kvasize; bp->b_data = scratch_bp->b_data; /* Get rid of the scratch buffer. */ scratch_bp->b_kvasize = 0; scratch_bp->b_flags |= B_INVAL; scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf; brelse(scratch_bp); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBCACHEBUF) panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, MAXBCACHEBUF); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { /* * If buffer is pinned and caller does * not want sleep waiting for it to be * unpinned, bail out * */ if (bp->b_pin_count > 0) { if (flags & GB_LOCK_NOWAIT) { bqrelse(bp); return (NULL); } else { bunpin_wait(bp); } } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; BUF_ASSERT_HELD(bp); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); bufmallocadjust(bp, 0); bp->b_data = bp->b_kvabase; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages(bp, newbsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ((bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufmallocadjust(bp, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page( (vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); } else BUF_CHECK_UNMAPPED(bp); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, "biodep")) continue; bp->b_pages[i] = NULL; vm_page_lock(m); vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_COUNT(desiredpages - bp->b_npages)); if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } } /* Record changes in allocation size. */ if (bp->b_bufsize != newbsize) bufspaceadjust(bp, newbsize); bp->b_bcount = size; /* requested buffer size. */ return 1; } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, OFF_TO_IDX(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else { bp->bio_flags |= BIO_DONE; done(bp); } } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. */ static void bufdonebio(struct bio *bip) { struct buf *bp; bp = bip->bio_caller2; bp->b_resid = bip->bio_resid; bp->b_ioflags = bip->bio_flags; bp->b_error = bip->bio_error; if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); g_destroy_bio(bip); } void dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; int ref; KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s) %p", devtoname(dev), dev)); csw = dev_refthread(dev, &ref); dev_strategy_csw(dev, csw, bp); dev_relthread(dev, ref); } void dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp) { struct bio *bip; KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE, ("b_iocmd botch")); KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) || dev->si_threadcount > 0, ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev), dev)); if (csw == NULL) { bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; bufdone(bp); return; } for (;;) { bip = g_new_bio(); if (bip != NULL) break; /* Try again later */ tsleep(&bp, PRIBIO, "dev_strat", hz/10); } bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ bdata2bio(bp, bip); bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; (*csw->d_strategy)(bip); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int bogus, i, iosize; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("biodone_finish: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("biodone_finish: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("biodone_finish: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone_finish: bp %p has no buffer offset", bp)); /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } bogus = 0; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("biodone_finish: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("bufdone_finish:" " page %p has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } vm_page_sunbusy(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_object_pip_subtract(obj, 1); vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage"); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; if (vm_page_sbusied(p)) printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); p->wire_count--; vm_page_free(p); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } void bpin(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_pin_count++; mtx_unlock(mtxp); } void bunpin(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); if (--bp->b_pin_count == 0) wakeup(bp); mtx_unlock(mtxp); } void bunpin_wait(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while (bp->b_pin_count > 0) msleep(bp, mtxp, PRIBIO, "bwunpin", 0); mtx_unlock(mtxp); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if ((bp->b_flags & B_INFREECNT) != 0) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 285992) +++ head/sys/sys/buf.h (revision 285993) @@ -1,554 +1,553 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; typedef unsigned char b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint8_t b_iocmd; uint8_t b_ioflags; off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ union { TAILQ_ENTRY(buf) bu_freelist; /* (Q) */ struct { void (*pg_iodone)(void *, vm_page_t *, int, int); int pg_reqpage; } bu_pager; } b_union; #define b_freelist b_union.bu_freelist #define b_pager b_union.bu_pager union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; int b_pin_count; }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_00000800 0x00000800 /* Available flag. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceeding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_000400000 0x00040000 /* Available flag. */ #define B_000800000 0x00080000 /* Available flag. */ #define B_001000000 0x00100000 /* Available flag. */ #define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14b11\13eintr\12done\11persist\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif /* * Find out if the lock has waiters or not. */ #define BUF_LOCKWAITERS(bp) \ lockmgr_waiters(&(bp)->b_lock) #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; -extern struct buf *buf; /* The buffer headers. */ -extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); +void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); void bpin(struct buf *); void bunpin(struct buf *); void bunpin_wait(struct buf *); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: head/sys/ufs/ffs/ffs_subr.c =================================================================== --- head/sys/ufs/ffs/ffs_subr.c (revision 285992) +++ head/sys/ufs/ffs/ffs_subr.c (revision 285993) @@ -1,414 +1,379 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 */ #include __FBSDID("$FreeBSD$"); #include #ifndef _KERNEL #include #include #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#ifdef KDB -void ffs_checkoverlap(struct buf *, struct inode *); -#endif - /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ int ffs_blkatoff(vp, offset, res, bpp) struct vnode *vp; off_t offset; char **res; struct buf **bpp; { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ip->i_fs; lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ void ffs_load_inode(bp, ip, fs, ino) struct buf *bp; struct inode *ip; struct fs *fs; ino_t ino; { if (ip->i_ump->um_fstype == UFS1) { *ip->i_din1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = ip->i_din1->di_mode; ip->i_nlink = ip->i_din1->di_nlink; ip->i_size = ip->i_din1->di_size; ip->i_flags = ip->i_din1->di_flags; ip->i_gen = ip->i_din1->di_gen; ip->i_uid = ip->i_din1->di_uid; ip->i_gid = ip->i_din1->di_gid; } else { *ip->i_din2 = *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = ip->i_din2->di_mode; ip->i_nlink = ip->i_din2->di_nlink; ip->i_size = ip->i_din2->di_size; ip->i_flags = ip->i_din2->di_flags; ip->i_gen = ip->i_din2->di_gen; ip->i_uid = ip->i_din2->di_uid; ip->i_gid = ip->i_din2->di_gid; } } #endif /* KERNEL */ /* * Update the frsum fields to reflect addition or deletion * of some frags. */ void ffs_fragacct(fs, fragmap, fraglist, cnt) struct fs *fs; int fragmap; int32_t fraglist[]; int cnt; { int inblk; int field, subfield; int siz, pos; inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; fragmap <<= 1; for (siz = 1; siz < fs->fs_frag; siz++) { if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) continue; field = around[siz]; subfield = inside[siz]; for (pos = siz; pos <= fs->fs_frag; pos++) { if ((fragmap & field) == subfield) { fraglist[siz] += cnt; pos += siz; field <<= siz; subfield <<= siz; } field <<= 1; subfield <<= 1; } } } - -#ifdef KDB -void -ffs_checkoverlap(bp, ip) - struct buf *bp; - struct inode *ip; -{ - struct buf *ebp, *ep; - ufs2_daddr_t start, last; - struct vnode *vp; - - ebp = &buf[nbuf]; - start = bp->b_blkno; - last = start + btodb(bp->b_bcount) - 1; - for (ep = buf; ep < ebp; ep++) { - if (ep == bp || (ep->b_flags & B_INVAL) || - ep->b_vp == NULLVP) - continue; - vp = ip->i_devvp; - /* look for overlap */ - if (ep->b_bcount == 0 || ep->b_blkno > last || - ep->b_blkno + btodb(ep->b_bcount) <= start) - continue; - vprint("Disk overlap", vp); - printf("\tstart %jd, end %jd overlap start %jd, end %jd\n", - (intmax_t)start, (intmax_t)last, (intmax_t)ep->b_blkno, - (intmax_t)(ep->b_blkno + btodb(ep->b_bcount) - 1)); - panic("ffs_checkoverlap: Disk buffer overlap"); - } -} -#endif /* KDB */ /* * block operations * * check if a block is available */ int ffs_isblock(fs, cp, h) struct fs *fs; unsigned char *cp; ufs1_daddr_t h; { unsigned char mask; switch ((int)fs->fs_frag) { case 8: return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: #ifdef _KERNEL panic("ffs_isblock"); #endif break; } return (0); } /* * check if a block is free */ int ffs_isfreeblock(fs, cp, h) struct fs *fs; u_char *cp; ufs1_daddr_t h; { switch ((int)fs->fs_frag) { case 8: return (cp[h] == 0); case 4: return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); case 2: return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); case 1: return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); default: #ifdef _KERNEL panic("ffs_isfreeblock"); #endif break; } return (0); } /* * take a block out of the map */ void ffs_clrblock(fs, cp, h) struct fs *fs; u_char *cp; ufs1_daddr_t h; { switch ((int)fs->fs_frag) { case 8: cp[h] = 0; return; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: #ifdef _KERNEL panic("ffs_clrblock"); #endif break; } } /* * put a block into the map */ void ffs_setblock(fs, cp, h) struct fs *fs; unsigned char *cp; ufs1_daddr_t h; { switch ((int)fs->fs_frag) { case 8: cp[h] = 0xff; return; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: #ifdef _KERNEL panic("ffs_setblock"); #endif break; } } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(fs, cgp, blkno, cnt) struct fs *fs; struct cg *cgp; ufs1_daddr_t blkno; int cnt; { int32_t *sump; int32_t *lp; u_char *freemapp, *mapp; int i, start, end, forw, back, map, bit; if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree(cgp); sump = cg_clustersum(cgp); /* * Allocate or clear the actual block. */ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. */ start = blkno + 1; end = start + fs->fs_contigsumsize; if (end >= cgp->cg_nclusterblks) end = cgp->cg_nclusterblks; mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1 << (start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. */ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1 << (start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; sump[i] += cnt; if (back > 0) sump[back] -= cnt; if (forw > 0) sump[forw] -= cnt; /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cgp->cg_cgx] = i; } Index: head/sys/vm/vm_pager.c =================================================================== --- head/sys/vm/vm_pager.c (revision 285992) +++ head/sys/vm/vm_pager.c (revision 285993) @@ -1,593 +1,595 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ +struct buf *swbuf; + static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dead_pager_dealloc(vm_object_t); static int dead_pager_getpages(obj, ma, count, req) vm_object_t obj; vm_page_t *ma; int count; int req; { return VM_PAGER_FAIL; } static vm_object_t dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { return NULL; } static void dead_pager_putpages(object, m, count, flags, rtvals) vm_object_t object; vm_page_t *m; int count; int flags; int *rtvals; { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } static int dead_pager_haspage(object, pindex, prev, next) vm_object_t object; vm_pindex_t pindex; int *prev; int *next; { if (prev) *prev = 0; if (next) *next = 0; return FALSE; } static void dead_pager_dealloc(object) vm_object_t object; { return; } static struct pagerops deadpagerops = { .pgo_alloc = dead_pager_alloc, .pgo_dealloc = dead_pager_dealloc, .pgo_getpages = dead_pager_getpages, .pgo_putpages = dead_pager_putpages, .pgo_haspage = dead_pager_haspage, }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &physpagerops, /* OBJT_PHYS */ &deadpagerops, /* OBJT_DEAD */ &sgpagerops, /* OBJT_SG */ &mgtdevicepagerops, /* OBJT_MGTDEVICE */ }; static const int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ struct mtx_padalign pbuf_mtx; static TAILQ_HEAD(swqueue, buf) bswlist; static int bswneeded; vm_offset_t swapbkva; /* swap buffers kva */ void vm_pager_init() { struct pagerops **pgops; TAILQ_INIT(&bswlist); /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if ((*pgops)->pgo_init != NULL) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF); bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; vnode_pbuf_freecnt = nswbuf / 2 + 1; vnode_async_pbuf_freecnt = nswbuf / 2; } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { vm_object_t ret; struct pagerops *ops; ops = pagertab[type]; if (ops) ret = (*ops->pgo_alloc) (handle, size, prot, off, cred); else ret = NULL; return (ret); } /* * The object must be locked. */ void vm_pager_deallocate(object) vm_object_t object; { VM_OBJECT_ASSERT_WLOCKED(object); (*pagertab[object->type]->pgo_dealloc) (object); } static void vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count) { #ifdef INVARIANTS VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(count > 0, ("%s: 0 count", __func__)); /* * All pages must be busied, not mapped, not fully valid, * not dirty and belong to the proper object. */ for (int i = 0 ; i < count; i++) { vm_page_assert_xbusied(m[i]); KASSERT(!pmap_page_is_mapped(m[i]), ("%s: page %p is mapped", __func__, m[i])); KASSERT(m[i]->valid != VM_PAGE_BITS_ALL, ("%s: request for a valid page %p", __func__, m[i])); KASSERT(m[i]->dirty == 0, ("%s: page %p is dirty", __func__, m[i])); KASSERT(m[i]->object == object, ("%s: wrong object %p/%p", __func__, object, m[i]->object)); } #endif } /* * Page in the pages for the object using its associated pager. * The requested page must be fully valid on successful return. */ int vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int reqpage) { int r; vm_pager_assert_in(object, m, count); r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage); if (r != VM_PAGER_OK) return (r); /* * If pager has replaced the page, assert that it had * updated the array. Also assert that page is still * busied. */ KASSERT(m[reqpage] == vm_page_lookup(object, m[reqpage]->pindex), ("%s: mismatch page %p pindex %ju", __func__, m[reqpage], (uintmax_t )m[reqpage]->pindex)); vm_page_assert_xbusied(m[reqpage]); /* * Pager didn't fill up entire page. Zero out * partially filled data. */ if (m[reqpage]->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m[reqpage], TRUE); return (VM_PAGER_OK); } int vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count, int reqpage, pgo_getpages_iodone_t iodone, void *arg) { vm_pager_assert_in(object, m, count); return ((*pagertab[object->type]->pgo_getpages_async)(object, m, count, reqpage, iodone, arg)); } /* * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h */ /* * Search the specified pager object list for an object with the * specified handle. If an object with the specified handle is found, * increase its reference count and return it. Otherwise, return NULL. * * The pager object list must be locked. */ vm_object_t vm_pager_object_lookup(struct pagerlst *pg_list, void *handle) { vm_object_t object; TAILQ_FOREACH(object, pg_list, pager_object_list) { if (object->handle == handle) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); break; } VM_OBJECT_WUNLOCK(object); } } return (object); } /* * Free the non-requested pages from the given array. To remove all pages, * caller should provide out of range reqpage number. */ void vm_pager_free_nonreq(vm_object_t object, vm_page_t ma[], int reqpage, int npages, boolean_t object_locked) { enum { UNLOCKED, CALLER_LOCKED, INTERNALLY_LOCKED } locked; int i; if (object_locked) { VM_OBJECT_ASSERT_WLOCKED(object); locked = CALLER_LOCKED; } else { VM_OBJECT_ASSERT_UNLOCKED(object); locked = UNLOCKED; } for (i = 0; i < npages; ++i) { if (i != reqpage) { if (locked == UNLOCKED) { VM_OBJECT_WLOCK(object); locked = INTERNALLY_LOCKED; } vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } } if (locked == INTERNALLY_LOCKED) VM_OBJECT_WUNLOCK(object); } /* * initialize a physical buffer */ /* * XXX This probably belongs in vfs_bio.c */ static void initpbuf(struct buf *bp) { KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj")); KASSERT(bp->b_vp == NULL, ("initpbuf with vp")); bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ bp->b_kvabase = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva; bp->b_data = bp->b_kvabase; bp->b_kvasize = MAXPHYS; bp->b_flags = 0; bp->b_xflags = 0; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE, NULL); } /* * allocate a physical buffer * * There are a limited number (nswbuf) of physical buffers. We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * getpbuf(int *pfreecnt) { struct buf *bp; mtx_lock(&pbuf_mtx); for (;;) { if (pfreecnt) { while (*pfreecnt == 0) { msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ if ((bp = TAILQ_FIRST(&bswlist)) != NULL) break; bswneeded = 1; msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0); /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; mtx_unlock(&pbuf_mtx); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(int *pfreecnt) { struct buf *bp; mtx_lock(&pbuf_mtx); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { mtx_unlock(&pbuf_mtx); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; mtx_unlock(&pbuf_mtx); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ void relpbuf(struct buf *bp, int *pfreecnt) { if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } KASSERT(bp->b_vp == NULL, ("relpbuf with vp")); KASSERT(bp->b_bufobj == NULL, ("relpbuf with bufobj")); BUF_UNLOCK(bp); mtx_lock(&pbuf_mtx); TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); if (bswneeded) { bswneeded = 0; wakeup(&bswneeded); } if (pfreecnt) { if (++*pfreecnt == 1) wakeup(pfreecnt); } mtx_unlock(&pbuf_mtx); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); KASSERT(bp->b_bufobj == NULL, ("pbgetvp: not free (bufobj)")); bp->b_vp = vp; bp->b_flags |= B_PAGING; bp->b_bufobj = &vp->v_bufobj; } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetbo(struct bufobj *bo, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetbo: not free (vnode)")); KASSERT(bp->b_bufobj == NULL, ("pbgetbo: not free (bufobj)")); bp->b_flags |= B_PAGING; bp->b_bufobj = bo; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(struct buf *bp) { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelvp: pager buf on vnode list.")); bp->b_vp = NULL; bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; } /* * Disassociate a p-buffer from a bufobj. */ void pbrelbo(struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode")); KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelbo: pager buf on vnode list.")); bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; }