diff --git a/sys/alpha/include/param.h b/sys/alpha/include/param.h index 6e36e70db9ab..1a9c9af545a5 100644 --- a/sys/alpha/include/param.h +++ b/sys/alpha/include/param.h @@ -1,143 +1,138 @@ /* $FreeBSD$ */ /* From: NetBSD: param.h,v 1.20 1997/09/19 13:52:53 leo Exp */ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: machparam.h 1.11 89/08/14$ * * @(#)param.h 8.1 (Berkeley) 6/10/93 */ /* * Machine dependent constants for the Alpha. */ /* * Round p (pointer or byte index) up to a correctly-aligned value for all * data types (int, long, ...). The result is u_long and must be cast to * any desired pointer type. * * ALIGNED_POINTER is a boolean macro that checks whether an address * is valid to fetch data elements of type t from on this architecture. * This does not reflect the optimal alignment, just the possibility * (within reasonable limits). 
* */ #ifndef _ALIGNBYTES #define _ALIGNBYTES 7 #endif #ifndef _ALIGN #define _ALIGN(p) (((u_long)(p) + _ALIGNBYTES) &~ _ALIGNBYTES) #endif #ifndef _ALIGNED_POINTER #define _ALIGNED_POINTER(p,t) ((((u_long)(p)) & (sizeof(t)-1)) == 0) #endif #ifndef _MACHINE #define _MACHINE alpha #endif #ifndef _MACHINE_ARCH #define _MACHINE_ARCH alpha #endif #ifndef _NO_NAMESPACE_POLLUTION #ifndef _MACHINE_PARAM_H_ #define _MACHINE_PARAM_H_ #ifndef MACHINE #define MACHINE "alpha" #endif #ifndef MACHINE_ARCH #define MACHINE_ARCH "alpha" #endif #define MID_MACHINE MID_ALPHA #include #include #define MAXSMPCPU 8 #ifdef SMP #define MAXCPU MAXSMPCPU #else #define MAXCPU 1 #endif #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) #define ALIGNED_POINTER(p,t) _ALIGNED_POINTER(p,t) #define PAGE_SIZE (1 << ALPHA_PGSHIFT) /* bytes/page */ #define PAGE_SHIFT ALPHA_PGSHIFT #define PAGE_MASK (PAGE_SIZE-1) #define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) #define KERNBASE 0xfffffc0000300000LL /* start of kernel virtual */ #define BTOPKERNBASE ((u_long)KERNBASE >> PGSHIFT) #define CLSIZE 1 #define CLSIZELOG2 0 /* NOTE: SSIZE, SINCR and UPAGES must be multiples of CLSIZE */ #define SSIZE 1 /* initial stack size/NBPG */ #define SINCR 1 /* increment of stack/NBPG */ -/* PREEMPTION exposes scheduler bugs that need to be fixed. 
*/ -#if 0 -#define PREEMPTION -#endif - #ifndef KSTACK_PAGES #define KSTACK_PAGES 2 /* pages of kstack (with pcb) */ #endif #define KSTACK_GUARD_PAGES 1 /* pages of kstack guard; 0 disables */ #define UAREA_PAGES 1 /* pages of u-area */ /* * Mach derived conversion macros */ #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) #define trunc_page(x) ((unsigned long)(x) & ~(PAGE_MASK)) #define atop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT) #define alpha_btop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define alpha_ptob(x) ((unsigned long)(x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024)) #endif /* !_MACHINE_PARAM_H_ */ #endif /* !_NO_NAMESPACE_POLLUTION */ diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index aa8c29e95c1c..5216c55a28dc 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -1,170 +1,165 @@ /* * Copyright (c) 2002 David E. O'Brien. All rights reserved. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ /* * Machine dependent constants for AMD64. */ /* * Round p (pointer or byte index) up to a correctly-aligned value * for all data types (int, long, ...). The result is u_long and * must be cast to any desired pointer type. * * ALIGNED_POINTER is a boolean macro that checks whether an address * is valid to fetch data elements of type t from on this architecture. * This does not reflect the optimal alignment, just the possibility * (within reasonable limits). 
* */ #ifndef _ALIGNBYTES #define _ALIGNBYTES (sizeof(long) - 1) #endif #ifndef _ALIGN #define _ALIGN(p) (((u_long)(p) + _ALIGNBYTES) &~ _ALIGNBYTES) #endif #ifndef _ALIGNED_POINTER #define _ALIGNED_POINTER(p,t) ((((u_long)(p)) & (sizeof(t)-1)) == 0) #endif #ifndef _MACHINE #define _MACHINE amd64 #endif #ifndef _MACHINE_ARCH #define _MACHINE_ARCH amd64 #endif #ifndef _NO_NAMESPACE_POLLUTION #ifndef _MACHINE_PARAM_H_ #define _MACHINE_PARAM_H_ #ifndef MACHINE #define MACHINE "amd64" #endif #ifndef MACHINE_ARCH #define MACHINE_ARCH "amd64" #endif #ifdef SMP #define MAXCPU 8 #else #define MAXCPU 1 #endif #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) #define ALIGNED_POINTER(p,t) _ALIGNED_POINTER(p,t) /* Size of the level 1 page table units */ #define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NPTEPGSHIFT 9 /* LOG2(NPTEPG) */ #define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ #define PAGE_SIZE (1<> PAGE_SHIFT) #define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT) #define amd64_btop(x) ((unsigned long)(x) >> PAGE_SHIFT) #define amd64_ptob(x) ((unsigned long)(x) << PAGE_SHIFT) #define pgtok(x) ((unsigned long)(x) * (PAGE_SIZE / 1024)) #endif /* !_MACHINE_PARAM_H_ */ #endif /* !_NO_NAMESPACE_POLLUTION */ diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 60300f5ba927..43a870906202 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1,2495 +1,2500 @@ # $FreeBSD$ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers', # 'makeoptions', 'hints', etc. go into the kernel configuration that you # run config(8) with. # # Lines that begin with 'hint.' are NOT for config(8), they go into your # hints file. See /boot/device.hints and/or the 'hints' config(8) directive. # # Please use ``make LINT'' to create an old-style LINT file if you want to # do kernel test-builds. # # This file contains machine independent kernel configuration notes. 
For # machine dependent notes, look in /sys/<arch>/conf/NOTES. # # # NOTES conventions and style guide: # # Large block comments should begin and end with a line containing only a # comment character. # # To describe a particular object, a block comment (if it exists) should # come first. Next should come device, options, and hints lines in that # order. All device and option lines must be described by a comment that # doesn't just expand the device or option name. Use only a concise # comment on the same line if possible. Very detailed descriptions of # devices and subsystems belong in man pages. # # A space followed by a tab separates 'options' from an option name. Two # spaces followed by a tab separate 'device' from a device name. Comments # after an option or device should use one space after the comment character. # To comment out a negative option that disables code and thus should not be # enabled for LINT builds, precede 'options' with "#!". # # # This is the ``identification'' of the kernel. Usually this should # be the same as the name of your kernel. # ident LINT # # The `maxusers' parameter controls the static sizing of a number of # internal system tables by a formula defined in subr_param.c. # Omitting this parameter or setting it to 0 will cause the system to # auto-size based on physical memory. # maxusers 10 # # The `makeoptions' parameter allows variables to be passed to the # generated Makefile in the build area. # # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS} # after most other flags. Here we use it to inhibit use of non-optimal # gcc builtin functions (e.g., memcmp). # # DEBUG happens to be magic. # The following is equivalent to 'config -g KERNELNAME' and creates # 'kernel.debug' compiled with -g debugging as well as a normal # 'kernel'. Use 'make install.debug' to install the debug kernel # but that isn't normally necessary as the debug symbols are not loaded # by the kernel and are not useful there anyway. 
# # KERNEL can be overridden so that you can change the default name of your # kernel. # # MODULES_OVERRIDE can be used to limit modules built to a specific list. # makeoptions CONF_CFLAGS=-fno-builtin #Don't allow use of memcmp, etc. #makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols #makeoptions KERNEL=foo #Build kernel "foo" and install "/foo" # Only build Linux API modules and plus those parts of the sound system I need. #makeoptions MODULES_OVERRIDE="linux sound/sound sound/driver/maestro3" makeoptions DESTDIR=/tmp # # Certain applications can grow to be larger than the 512M limit # that FreeBSD initially imposes. Below are some options to # allow that limit to grow to 1GB, and can be increased further # with changing the parameters. MAXDSIZ is the maximum that the # limit can be set to, and the DFLDSIZ is the default value for # the limit. MAXSSIZ is the maximum that the stack limit can be # set to. You might want to set the default lower than the max, # and explicitly set the maximum with a shell command for processes # that regularly exceed the limit like INND. # options MAXDSIZ=(1024UL*1024*1024) options MAXSSIZ=(128UL*1024*1024) options DFLDSIZ=(1024UL*1024*1024) # # BLKDEV_IOSIZE sets the default block size used in user block # device I/O. Note that this value will be overridden by the label # when specifying a block device from a label with a non-0 # partition blocksize. The default is PAGE_SIZE. 
# options BLKDEV_IOSIZE=8192 # Options for the VM subsystem # L2 cache size (in KB) can be specified in PQ_CACHESIZE options PQ_CACHESIZE=512 # color for 512k cache # Deprecated options supported for backwards compatibility #options PQ_NOOPT # No coloring #options PQ_LARGECACHE # color for 512k cache #options PQ_HUGECACHE # color for 1024k cache #options PQ_MEDIUMCACHE # color for 256k cache #options PQ_NORMALCACHE # color for 64k cache # This allows you to actually store this configuration file into # the kernel binary itself, where it may be later read by saying: # strings -n 3 /boot/kernel/kernel | sed -n 's/^___//p' > MYKERNEL # options INCLUDE_CONFIG_FILE # Include this file in kernel options GEOM_AES # Don't use, use GEOM_BDE options GEOM_APPLE # Apple partitioning options GEOM_BDE # Disk encryption. options GEOM_BSD # BSD disklabels options GEOM_CONCAT # Disk concatenation. options GEOM_FOX # Redundant path mitigation options GEOM_GATE # Userland services. options GEOM_GPT # GPT partitioning options GEOM_LABEL # Providers labelization. options GEOM_MBR # DOS/MBR partitioning options GEOM_MIRROR # Disk mirroring. options GEOM_NOP # Test class. options GEOM_PC98 # NEC PC9800 partitioning options GEOM_RAID3 # RAID3 functionality. options GEOM_STRIPE # Disk striping. options GEOM_SUNLABEL # Sun/Solaris partitioning options GEOM_UZIP # Read-only compressed disks options GEOM_VOL # Volume names from UFS superblock # # The root device and filesystem type can be compiled in; # this provides a fallback option if the root device cannot # be correctly guessed by the bootstrap code, or an override if # the RB_DFLTROOT flag (-r) is specified when booting the kernel. # options ROOTDEVNAME=\"ufs:da0s2e\" ##################################################################### # Scheduler options: # # Specifying one of SCHED_4BSD or SCHED_ULE is mandatory. These options # select which scheduler is compiled in. # # SCHED_4BSD is the historical, proven, BSD scheduler. 
It has a global run # queue and no cpu affinity which makes it suboptimal for SMP. It has very # good interactivity and priority selection. # # SCHED_ULE is a new scheduler that has been designed for SMP and has some # advantages for UP as well. It is intended to replace the 4BSD scheduler # over time. # options SCHED_4BSD #options SCHED_ULE ##################################################################### # SMP OPTIONS: # # SMP enables building of a Symmetric MultiProcessor Kernel. # Mandatory: options SMP # Symmetric MultiProcessor Kernel # ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin # if the thread that currently owns the mutex is executing on another # CPU. This behaviour is enabled by default, so this option can be used # to disable it. options NO_ADAPTIVE_MUTEXES # ADAPTIVE_GIANT causes the Giant lock to also be made adaptive when # running without NO_ADAPTIVE_MUTEXES. Normally, because Giant is assumed # to be held for extended periods, contention on Giant will cause a thread # to sleep rather than spinning. options ADAPTIVE_GIANT # MUTEX_NOINLINE forces mutex operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, MUTEX_PROFILING, # and WITNESS options. options MUTEX_NOINLINE # MUTEX_WAKE_ALL changes the mutex unlock algorithm to wake all waiters # when a contested mutex is released rather than just awaking the highest # priority waiter. options MUTEX_WAKE_ALL # SMP Debugging Options: # +# PREEMPTION allows the threads that are in the kernel to be preempted +# by higher priority threads. It helps with interactivity and +# allows interrupt threads to run sooner rather than waiting. +# WARNING! Only tested on alpha, amd64, and i386. # FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel # threads. 
Its sole use is to expose race conditions and other # bugs during development. Enabling this option will reduce # performance and increase the frequency of kernel panics by # design. If you aren't sure that you need it then you don't. -# DON'T TURN THIS ON. +# Relies on the PREEMPTION option. DON'T TURN THIS ON. # MUTEX_DEBUG enables various extra assertions in the mutex code. # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table # used to hold active sleep queues. # TURNSTILE_PROFILING enables rudimentary profiling of the hash table # used to hold active lock queues. # WITNESS enables the witness code which detects deadlocks and cycles # during locking operations. # WITNESS_KDB causes the witness code to drop into the kernel debugger if # a lock hierarchy violation occurs or if locks are held when going to # sleep. # WITNESS_SKIPSPIN disables the witness checks on spin mutexes. +options PREEMPTION options FULL_PREEMPTION options MUTEX_DEBUG options WITNESS options WITNESS_KDB options WITNESS_SKIPSPIN # MUTEX_PROFILING - Profiling mutual exclusion locks (mutexes). See # MUTEX_PROFILING(9) for details. options MUTEX_PROFILING # Set the number of buffers and the hash size. The hash size MUST be larger # than the number of buffers. Hash size should be prime. options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING ##################################################################### # COMPATIBILITY OPTIONS # # Implement system calls compatible with 4.3BSD and older versions of # FreeBSD. You probably do NOT want to remove this as much current code # still relies on the 4.3 emulation. Note that some architectures that # are supported by FreeBSD do not include support for certain important # aspects of this compatibility option, namely those related to the # signal delivery mechanism. 
# options COMPAT_43 # Enable FreeBSD4 compatibility syscalls options COMPAT_FREEBSD4 # # These three options provide support for System V Interface # Definition-style interprocess communication, in the form of shared # memory, semaphores, and message queues, respectively. # options SYSVSHM options SYSVSEM options SYSVMSG ##################################################################### # DEBUGGING OPTIONS # # Compile with kernel debugger related code. # options KDB # # Print a stack trace of the current thread on the console for a panic. # options KDB_TRACE # # Don't enter the debugger for a panic. Intended for unattended operation # where you may want to enter the debugger from the console, but still want # the machine to recover from a panic. # options KDB_UNATTENDED # # Enable the ddb debugger backend. # options DDB # # Print the numerical value of symbols in addition to the symbolic # representation. # options DDB_NUMSYM # # Enable the remote gdb debugger backend. # options GDB # # KTRACE enables the system-call tracing facility ktrace(2). To be more # SMP-friendly, KTRACE uses a worker thread to process most trace events # asynchronously to the thread generating the event. This requires a # pre-allocated store of objects representing trace events. The # KTRACE_REQUEST_POOL option specifies the initial size of this store. # The size of the pool can be adjusted both at boottime and runtime via # the kern.ktrace_request_pool tunable and sysctl. # options KTRACE #kernel tracing options KTRACE_REQUEST_POOL=101 # # KTR is a kernel tracing mechanism imported from BSD/OS. Currently it # has no userland interface aside from a few sysctl's. It is enabled with # the KTR option. KTR_ENTRIES defines the number of entries in the circular # trace buffer. KTR_COMPILE defines the mask of events to compile into the # kernel as defined by the KTR_* constants in <sys/ktr.h>. KTR_MASK defines the # initial value of the ktr_mask variable which determines at runtime what # events to trace. 
KTR_CPUMASK determines which CPU's log events, with # bit X corresponding to cpu X. KTR_VERBOSE enables dumping of KTR events # to the console by default. This functionality can be toggled via the # debug.ktr_verbose sysctl and defaults to off if KTR_VERBOSE is not defined. # options KTR options KTR_ENTRIES=1024 options KTR_COMPILE=(KTR_INTR|KTR_PROC) options KTR_MASK=KTR_INTR options KTR_CPUMASK=0x3 options KTR_VERBOSE # # The INVARIANTS option is used in a number of source files to enable # extra sanity checking of internal structures. This support is not # enabled by default because of the extra time it would take to check # for these conditions, which can only occur as a result of # programming errors. # options INVARIANTS # # The INVARIANT_SUPPORT option makes us compile in support for # verifying some of the internal structures. It is a prerequisite for # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be # called. The intent is that you can set 'INVARIANTS' for single # source files (by changing the source file or specifying it on the # command line) if you have 'INVARIANT_SUPPORT' enabled. Also, if you # wish to build a kernel module with 'INVARIANTS', then adding # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary # infrastructure without the added overhead. # options INVARIANT_SUPPORT # # The DIAGNOSTIC option is used to enable extra debugging information # from some parts of the kernel. As this makes everything more noisy, # it is disabled by default. # options DIAGNOSTIC # # REGRESSION causes optional kernel interfaces necessary only for regression # testing to be enabled. These interfaces may constitute security risks # when enabled, as they permit processes to easily modify aspects of the # run-time environment to reproduce unlikely or unusual (possibly normally # impossible) scenarios. 
# options REGRESSION # # RESTARTABLE_PANICS allows one to continue from a panic as if it were # a call to the debugger via the Debugger() function instead. It is only # useful if a kernel debugger is present. To restart from a panic, reset # the panicstr variable to NULL and continue execution. This option is # for development use only and should NOT be used in production systems # to "workaround" a panic. # #options RESTARTABLE_PANICS # # This option lets some drivers co-exist that can't co-exist in a running # system. This is used to be able to compile all kernel code in one go for # quality assurance purposes (like this file, which the option takes its name # from.) # options COMPILING_LINT ##################################################################### # NETWORKING OPTIONS # # Protocol families: # Only the INET (Internet) family is officially supported in FreeBSD. # options INET #Internet communications protocols options INET6 #IPv6 communications protocols options IPSEC #IP security options IPSEC_ESP #IP security (crypto; define w/ IPSEC) options IPSEC_DEBUG #debug for IP security # # Set IPSEC_FILTERGIF to force packets coming through a gif tunnel # to be processed by any configured packet filtering (ipfw, ipf). # The default is that packets coming from a tunnel are _not_ processed; # they are assumed trusted. # # IPSEC history is preserved for such packets, and can be filtered # using ipfw(8)'s 'ipsec' keyword, when this option is enabled. # #options IPSEC_FILTERGIF #filter ipsec packets from a tunnel #options FAST_IPSEC #new IPsec (cannot define w/ IPSEC) options IPX #IPX/SPX communications protocols options IPXIP #IPX in IP encapsulation (not available) #options NCP #NetWare Core protocol options NETATALK #Appletalk communications protocols options NETATALKDEBUG #Appletalk debugging # # SMB/CIFS requester # NETSMB enables support for SMB protocol, it requires LIBMCHAIN and LIBICONV # options. # NETSMBCRYPTO enables support for encrypted passwords. 
options NETSMB #SMB/CIFS requester options NETSMBCRYPTO #encrypted password support for SMB # mchain library. It can be either loaded as KLD or compiled into kernel options LIBMCHAIN # altq(9). Enable the base part of the hooks with the ALTQ option. # Individual disciplines must be built into the base system and can not be # loaded as modules at this point. In order to build a SMP kernel you must # also have the ALTQ_NOPCC option. options ALTQ options ALTQ_CBQ # Class Based Queueing options ALTQ_RED # Random Early Drop options ALTQ_RIO # RED In/Out options ALTQ_HFSC # Hierarchical Packet Scheduler options ALTQ_CDNR # Traffic conditioner options ALTQ_PRIQ # Priority Queueing options ALTQ_NOPCC # Required for SMP build options ALTQ_DEBUG # netgraph(4). Enable the base netgraph code with the NETGRAPH option. # Individual node types can be enabled with the corresponding option # listed below; however, this is not strictly necessary as netgraph # will automatically load the corresponding KLD module if the node type # is not already compiled into the kernel. Each type below has a # corresponding man page, e.g., ng_async(8). 
options NETGRAPH #netgraph(4) system options NETGRAPH_ASYNC options NETGRAPH_ATMLLC options NETGRAPH_ATM_ATMPIF options NETGRAPH_BLUETOOTH # ng_bluetooth(4) options NETGRAPH_BLUETOOTH_BT3C # ng_bt3c(4) options NETGRAPH_BLUETOOTH_H4 # ng_h4(4) options NETGRAPH_BLUETOOTH_HCI # ng_hci(4) options NETGRAPH_BLUETOOTH_L2CAP # ng_l2cap(4) options NETGRAPH_BLUETOOTH_SOCKET # ng_btsocket(4) options NETGRAPH_BLUETOOTH_UBT # ng_ubt(4) options NETGRAPH_BLUETOOTH_UBTBCMFW # ubtbcmfw(4) options NETGRAPH_BPF options NETGRAPH_BRIDGE options NETGRAPH_CISCO options NETGRAPH_DEVICE options NETGRAPH_ECHO options NETGRAPH_EIFACE options NETGRAPH_ETHER options NETGRAPH_FEC options NETGRAPH_FRAME_RELAY options NETGRAPH_GIF options NETGRAPH_GIF_DEMUX options NETGRAPH_HOLE options NETGRAPH_IFACE options NETGRAPH_IP_INPUT options NETGRAPH_KSOCKET options NETGRAPH_L2TP options NETGRAPH_LMI # MPPC compression requires proprietary files (not included) #options NETGRAPH_MPPC_COMPRESSION options NETGRAPH_MPPC_ENCRYPTION options NETGRAPH_ONE2MANY options NETGRAPH_PPP options NETGRAPH_PPPOE options NETGRAPH_PPTPGRE options NETGRAPH_RFC1490 options NETGRAPH_SOCKET options NETGRAPH_SPLIT options NETGRAPH_SPPP options NETGRAPH_TEE options NETGRAPH_TTY options NETGRAPH_UI options NETGRAPH_VJC # NgATM - Netgraph ATM options NGATM_ATM options NGATM_ATMBASE options NGATM_SSCOP options NGATM_SSCFU options NGATM_UNI options NGATM_CCATM device mn # Munich32x/Falc54 Nx64kbit/sec cards. device musycc # LMC/SBE LMC1504 quad T1/E1 # # Network interfaces: # The `loop' device is MANDATORY when networking is enabled. # The `ether' device provides generic code to handle # Ethernets; it is MANDATORY when an Ethernet device driver is # configured or token-ring is enabled. # The `wlan' device provides generic code to support 802.11 # drivers, including host AP mode; it is MANDATORY for the wi # driver and will eventually be required by all 802.11 drivers. # The `fddi' device provides generic code to support FDDI. 
# The `arcnet' device provides generic code to support Arcnet. # The `sppp' device serves a similar role for certain types # of synchronous PPP links (like `cx', `ar'). # The `sl' device implements the Serial Line IP (SLIP) service. # The `ppp' device implements the Point-to-Point Protocol. # The `bpf' device enables the Berkeley Packet Filter. Be # aware of the legal and administrative consequences of enabling this # option. The number of devices determines the maximum number of # simultaneous BPF clients programs runnable. # The `disc' device implements a minimal network interface, # which throws away all packets sent and never receives any. It is # included for testing purposes. This shows up as the `ds' interface. # The `tap' device is a pty-like virtual Ethernet interface # The `tun' device implements (user-)ppp and nos-tun # The `gif' device implements IPv6 over IP4 tunneling, # IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and # IPv6 over IPv6 tunneling. # The `gre' device implements two types of IP4 over IP4 tunneling: # GRE and MOBILE, as specified in the RFC1701 and RFC2004. # The XBONEHACK option allows the same pair of addresses to be configured on # multiple gif interfaces. # The `faith' device captures packets sent to it and diverts them # to the IPv4/IPv6 translation daemon. # The `stf' device implements 6to4 encapsulation. # The `ef' device provides support for multiple ethernet frame types # specified via ETHER_* options. See ef(4) for details. # # The pf packet filter consists of three devices: # The `pf' device provides /dev/pf and the firewall code itself. # The `pflog' device provides the pflog0 interface which logs packets. # The `pfsync' device provides the pfsync0 interface used for # synchronization of firewall state tables (over the net). # # The PPP_BSDCOMP option enables support for compress(1) style entire # packet compression, the PPP_DEFLATE is for zlib/gzip style compression. 
# PPP_FILTER enables code for filtering the ppp data stream and selecting # events for resetting the demand dial activity timer - requires bpf. # See pppd(8) for more details. # device ether #Generic Ethernet device vlan #VLAN support device wlan #802.11 support device token #Generic TokenRing device fddi #Generic FDDI device arcnet #Generic Arcnet device sppp #Generic Synchronous PPP device loop #Network loopback device device bpf #Berkeley packet filter device disc #Discard device (ds0, ds1, etc) device tap #Virtual Ethernet driver device tun #Tunnel driver (ppp(8), nos-tun(8)) device sl #Serial Line IP device gre #IP over IP tunneling device pf #PF OpenBSD packet-filter firewall device pflog #logging support interface for PF device pfsync #synchronization interface for PF device ppp #Point-to-point protocol options PPP_BSDCOMP #PPP BSD-compress support options PPP_DEFLATE #PPP zlib/deflate/gzip support options PPP_FILTER #enable bpf filtering (needs bpf) device ef # Multiple ethernet frames support options ETHER_II # enable Ethernet_II frame options ETHER_8023 # enable Ethernet_802.3 (Novell) frame options ETHER_8022 # enable Ethernet_802.2 frame options ETHER_SNAP # enable Ethernet_802.2/SNAP frame # for IPv6 device gif #IPv6 and IPv4 tunneling options XBONEHACK device faith #for IPv6 and IPv4 translation device stf #6to4 IPv6 over IPv4 encapsulation # # Internet family options: # # MROUTING enables the kernel multicast packet forwarder, which works # with mrouted(8). # # PIM enables Protocol Independent Multicast in the kernel. # Requires MROUTING enabled. # # IPFIREWALL enables support for IP firewall construction, in # conjunction with the `ipfw' program. IPFIREWALL_VERBOSE sends # logged packets to the system logger. IPFIREWALL_VERBOSE_LIMIT # limits the number of times a matching entry can be logged. 
# # WARNING: IPFIREWALL defaults to a policy of "deny ip from any to any" # and if you do not add other rules during startup to allow access, # YOU WILL LOCK YOURSELF OUT. It is suggested that you set firewall_type=open # in /etc/rc.conf when first enabling this feature, then refining the # firewall rules in /etc/rc.firewall after you've tested that the new kernel # feature works properly. # # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to # allow everything. Use with care, if a cracker can crash your # firewall machine, they can get to your protected machines. However, # if you are using it as an as-needed filter for specific problems as # they arise, then this may be for you. Changing the default to 'allow' # means that you won't get stuck if the kernel and /sbin/ipfw binary get # out of sync. # # IPDIVERT enables the divert IP sockets, used by ``ipfw divert'' # # IPFIREWALL_FORWARD enables changing of the packet destination either # to do some sort of policy routing or transparent proxying. Used by # ``ipfw forward''. # # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding # packets without touching the ttl). This can be useful to hide firewalls # from traceroute and similar tools. # # TCPDEBUG enables code which keeps traces of the TCP state machine # for sockets with the SO_DEBUG option set, which can then be examined # using the trpt(8) utility. 
# options MROUTING # Multicast routing options PIM # Protocol Independent Multicast options IPFIREWALL #firewall options IPFIREWALL_VERBOSE #enable logging to syslogd(8) options IPFIREWALL_VERBOSE_LIMIT=100 #limit verbosity options IPFIREWALL_DEFAULT_TO_ACCEPT #allow everything by default options IPFIREWALL_FORWARD #packet destination changes options IPV6FIREWALL #firewall for IPv6 options IPV6FIREWALL_VERBOSE options IPV6FIREWALL_VERBOSE_LIMIT=100 options IPV6FIREWALL_DEFAULT_TO_ACCEPT options IPDIVERT #divert sockets options IPFILTER #ipfilter support options IPFILTER_LOG #ipfilter logging options IPFILTER_DEFAULT_BLOCK #block all packets by default options IPSTEALTH #support for stealth forwarding options TCPDEBUG # The MBUF_STRESS_TEST option enables options which create # various random failures / extreme cases related to mbuf # functions. See mbuf(9) for a list of available test cases. options MBUF_STRESS_TEST # Statically Link in accept filters options ACCEPT_FILTER_DATA options ACCEPT_FILTER_HTTP # TCP_DROP_SYNFIN adds support for ignoring TCP packets with SYN+FIN. This # prevents nmap et al. from identifying the TCP/IP stack, but breaks support # for RFC1644 extensions and is not recommended for web servers. # options TCP_DROP_SYNFIN #drop TCP packets with SYN+FIN # TCP_SIGNATURE adds support for RFC 2385 (TCP-MD5) digests. These are # carried in TCP option 19. This option is commonly used to protect # TCP sessions (e.g. BGP) where IPSEC is not available nor desirable. # This is enabled on a per-socket basis using the TCP_MD5SIG socket option. # This requires the use of 'device crypto', 'options FAST_IPSEC', and # 'device cryptodev' as it depends on the non-KAME IPSEC SADB code. #options TCP_SIGNATURE #include support for RFC 2385 # DUMMYNET enables the "dummynet" bandwidth limiter. You need IPFIREWALL # as well. See dummynet(4) and ipfw(8) for more info. 
When you run # DUMMYNET it is advisable to also have "options HZ=1000" to achieve a # smoother scheduling of the traffic. # # BRIDGE enables bridging between ethernet cards -- see bridge(4). # You can use IPFIREWALL and DUMMYNET together with bridging. # options DUMMYNET options BRIDGE # Zero copy sockets support. This enables "zero copy" for sending and # receiving data via a socket. The send side works for any type of NIC, # the receive side only works for NICs that support MTUs greater than the # page size of your architecture and that support header splitting. See # zero_copy(9) for more details. options ZERO_COPY_SOCKETS # # ATM (HARP version) options # # ATM_CORE includes the base ATM functionality code. This must be included # for ATM support. # # ATM_IP includes support for running IP over ATM. # # At least one (and usually only one) of the following signalling managers # must be included (note that all signalling managers include PVC support): # ATM_SIGPVC includes support for the PVC-only signalling manager `sigpvc'. # ATM_SPANS includes support for the `spans' signalling manager, which runs # the FORE Systems's proprietary SPANS signalling protocol. # ATM_UNI includes support for the `uni30' and `uni31' signalling managers, # which run the ATM Forum UNI 3.x signalling protocols. # # The `hfa' driver provides support for the FORE Systems, Inc. # PCA-200E ATM PCI Adapter. # # The `harp' pseudo-driver makes all NATM interface drivers available to HARP. # options ATM_CORE #core ATM protocol family options ATM_IP #IP over ATM support options ATM_SIGPVC #SIGPVC signalling manager options ATM_SPANS #SPANS signalling manager options ATM_UNI #UNI signalling manager device hfa #FORE PCA-200E ATM PCI device harp #Pseudo-interface for NATM ##################################################################### # FILESYSTEM OPTIONS # # Only the root, /usr, and /tmp filesystems need be statically # compiled; everything else will be automatically loaded at mount # time. 
(Exception: the UFS family--- FFS --- cannot # currently be demand-loaded.) Some people still prefer to statically # compile other filesystems as well. # # NB: The NULL, PORTAL, UMAP and UNION filesystems are known to be # buggy, and WILL panic your system if you attempt to do anything with # them. They are included here as an incentive for some enterprising # soul to sit down and fix them. # # One of these is mandatory: options FFS #Fast filesystem options NFSCLIENT #Network File System client # The rest are optional: options CD9660 #ISO 9660 filesystem options FDESCFS #File descriptor filesystem options HPFS #OS/2 File system options MSDOSFS #MS DOS File System (FAT, FAT32) options NFSSERVER #Network File System server options NTFS #NT File System options NULLFS #NULL filesystem # Broken (depends on NCP): #options NWFS #NetWare filesystem options PORTALFS #Portal filesystem options PROCFS #Process filesystem (requires PSEUDOFS) options PSEUDOFS #Pseudo-filesystem framework options SMBFS #SMB/CIFS filesystem options UDF #Universal Disk Format # Broken (seriously (functionally) broken): #options UMAPFS #UID map filesystem options UNIONFS #Union filesystem # The xFS_ROOT options REQUIRE the associated ``options xFS'' options NFS_ROOT #NFS usable as root device # Soft updates is a technique for improving filesystem speed and # making abrupt shutdown less risky. # options SOFTUPDATES # Extended attributes allow additional data to be associated with files, # and is used for ACLs, Capabilities, and MAC labels. # See src/sys/ufs/ufs/README.extattr for more information. options UFS_EXTATTR options UFS_EXTATTR_AUTOSTART # Access Control List support for UFS filesystems. The current ACL # implementation requires extended attribute support, UFS_EXTATTR, # for the underlying filesystem. # See src/sys/ufs/ufs/README.acls for more information. options UFS_ACL # Directory hashing improves the speed of operations on very large # directories at the expense of some memory. 
options UFS_DIRHASH # Make space in the kernel for a root filesystem on a md device. # Define to the number of kilobytes to reserve for the filesystem. options MD_ROOT_SIZE=10 # Make the md device a potential root device, either with preloaded # images of type mfs_root or md_root. options MD_ROOT # Disk quotas are supported when this option is enabled. options QUOTA #enable disk quotas # If you are running a machine just as a fileserver for PC and MAC # users, using SAMBA or Netatalk, you may consider setting this option # and keeping all those users' directories on a filesystem that is # mounted with the suiddir option. This gives new files the same # ownership as the directory (similar to group). It's a security hole # if you let these users run programs, so confine it to file-servers # (but it'll save you lots of headaches in those cases). Root owned # directories are exempt and X bits are cleared. The suid bit must be # set on the directory as well; see chmod(1). PC owners can't see/set # ownerships so they keep getting their toes trodden on. This saves # you all the support calls as the filesystem it's used on will act as # they expect: "It's my dir so it must be my file". # options SUIDDIR # NFS options: options NFS_MINATTRTIMO=3 # VREG attrib cache timeout in sec options NFS_MAXATTRTIMO=60 options NFS_MINDIRATTRTIMO=30 # VDIR attrib cache timeout in sec options NFS_MAXDIRATTRTIMO=60 options NFS_GATHERDELAY=10 # Default write gather delay (msec) options NFS_WDELAYHASHSIZ=16 # and with this options NFS_DEBUG # Enable NFS Debugging # Coda stuff: options CODA #CODA filesystem. device vcoda #coda minicache <-> venus comm. # Use the old Coda 5.x venus<->kernel interface instead of the new # realms-aware 6.x protocol. #options CODA_COMPAT_5 # # Add support for the EXT2FS filesystem of Linux fame.
Be a bit # careful with this - the ext2fs code has a tendency to lag behind # changes and not be exercised very much, so mounting read/write could # be dangerous (and even mounting read only could result in panics.) # options EXT2FS # Use real implementations of the aio_* system calls. There are numerous # stability and security issues in the current aio code that make it # unsuitable for inclusion on machines with untrusted local users. options VFS_AIO # Cryptographically secure random number generator; /dev/random device random # The system memory devices; /dev/mem, /dev/kmem device mem # Optional character code conversion support with LIBICONV. # Each option requires its base file system and LIBICONV. options CD9660_ICONV options MSDOSFS_ICONV options NTFS_ICONV options UDF_ICONV # Experimental support for large MS-DOS filesystems. # # WARNING: This uses at least 32 bytes of kernel memory (which is not # reclaimed until the FS is unmounted) for each file on disk to map # between the 32-bit inode numbers used by VFS and the 64-bit pseudo-inode # numbers used internally by msdosfs. This is only safe to use in certain # controlled situations (e.g. read-only FS with less than 1 million files). # Since the mappings do not persist across unmounts (or reboots), these # filesystems are not suitable for exporting through NFS, or any other # application that requires fixed inode numbers. options MSDOSFS_LARGE ##################################################################### # POSIX P1003.1B # Real time extensions added in the 1993 Posix # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING options _KPOSIX_PRIORITY_SCHEDULING # p1003_1b_semaphores are very experimental, # user should be ready to assist in debugging if problems arise.
options P1003_1B_SEMAPHORES ##################################################################### # SECURITY POLICY PARAMETERS # Support for Mandatory Access Control (MAC): options MAC options MAC_BIBA options MAC_BSDEXTENDED options MAC_DEBUG options MAC_IFOFF options MAC_LOMAC options MAC_MLS options MAC_NONE options MAC_PARTITION options MAC_PORTACL options MAC_SEEOTHERUIDS options MAC_STUB options MAC_TEST ##################################################################### # CLOCK OPTIONS # The granularity of operation is controlled by the kernel option HZ whose # default value (100) means a granularity of 10ms (1s/HZ). # Some subsystems, such as DUMMYNET, might benefit from a smaller # granularity such as 1ms or less, for a smoother scheduling of packets. # Consider, however, that reducing the granularity too much might # cause excessive overhead in clock interrupt processing, # potentially causing ticks to be missed and thus actually reducing # the accuracy of operation. options HZ=100 # Enable support for the kernel PLL to use an external PPS signal, # under supervision of [x]ntpd(8) # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp options PPS_SYNC ##################################################################### # SCSI DEVICES # SCSI DEVICE CONFIGURATION # The SCSI subsystem consists of the `base' SCSI code, a number of # high-level SCSI device `type' drivers, and the low-level host-adapter # device drivers. The host adapters are listed in the ISA and PCI # device configuration sections below. # # It is possible to wire down your SCSI devices so that a given bus, # target, and LUN always come on line as the same device unit. In # earlier versions the unit numbers were assigned in the order that # the devices were probed on the SCSI bus. 
This means that if you # removed a disk drive, you may have had to rewrite your /etc/fstab # file, and also that you had to be careful when adding a new disk # as it may have been probed earlier and moved your device configuration # around. (See also option GEOM_VOL for a different solution to this # problem.) # This old behavior is maintained as the default behavior. The unit # assignment begins with the first non-wired down unit for a device # type. For example, if you wire a disk as "da3" then the first # non-wired disk will be assigned da4. # The syntax for wiring down devices is: hint.scbus.0.at="ahc0" hint.scbus.1.at="ahc1" hint.scbus.1.bus="0" hint.scbus.3.at="ahc2" hint.scbus.3.bus="0" hint.scbus.2.at="ahc2" hint.scbus.2.bus="1" hint.da.0.at="scbus0" hint.da.0.target="0" hint.da.0.unit="0" hint.da.1.at="scbus3" hint.da.1.target="1" hint.da.2.at="scbus2" hint.da.2.target="3" hint.sa.1.at="scbus1" hint.sa.1.target="6" # "units" (SCSI logical unit number) that are not specified are # treated as if specified as LUN 0. # All SCSI devices allocate as many units as are required. # The ch driver drives SCSI Media Changer ("jukebox") devices. # # The da driver drives SCSI Direct Access ("disk") and Optical Media # ("WORM") devices. # # The sa driver drives SCSI Sequential Access ("tape") devices. # # The cd driver drives SCSI Read Only Direct Access ("cd") devices. # # The ses driver drives SCSI Environment Services ("ses") and # SAF-TE ("SCSI Accessible Fault-Tolerant Enclosure") devices. # # The pt driver drives SCSI Processor devices. # # # Target Mode support is provided here but also requires that a SIM # (SCSI Host Adapter Driver) provide support as well. # # The targ driver provides target mode support as a Processor type device. # It exists to give the minimal context necessary to respond to Inquiry # commands. There is a sample user application that shows how the rest # of the command support might be done in /usr/share/examples/scsi_target. 
# # The targbh driver provides target mode support and exists to respond # to incoming commands that do not otherwise have a logical unit assigned # to them. # # The "unknown" device (uk? in pre-2.0.5) is now part of the base SCSI # configuration as the "pass" driver. device scbus #base SCSI code device ch #SCSI media changers device da #SCSI direct access devices (aka disks) device sa #SCSI tapes device cd #SCSI CD-ROMs device ses #SCSI Environmental Services (and SAF-TE) device pt #SCSI processor device targ #SCSI Target Mode Code device targbh #SCSI Target Mode Blackhole Device device pass #CAM passthrough driver # CAM OPTIONS: # debugging options: # -- NOTE -- If you specify one of the bus/target/lun options, you must # specify them all! # CAMDEBUG: When defined enables debugging macros # CAM_DEBUG_BUS: Debug the given bus. Use -1 to debug all busses. # CAM_DEBUG_TARGET: Debug the given target. Use -1 to debug all targets. # CAM_DEBUG_LUN: Debug the given lun. Use -1 to debug all luns. # CAM_DEBUG_FLAGS: OR together CAM_DEBUG_INFO, CAM_DEBUG_TRACE, # CAM_DEBUG_SUBTRACE, and CAM_DEBUG_CDB # # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds # CAM_NEW_TRAN_CODE: this is the new transport layer code that will be switched # to soon # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter) # queue after a bus reset, and the number of milliseconds to # freeze the device queue after a bus device reset. This # can be changed at boot and runtime with the # kern.cam.scsi_delay tunable/sysctl. 
options CAMDEBUG options CAM_DEBUG_BUS=-1 options CAM_DEBUG_TARGET=-1 options CAM_DEBUG_LUN=-1 options CAM_DEBUG_FLAGS=(CAM_DEBUG_INFO|CAM_DEBUG_TRACE|CAM_DEBUG_CDB) options CAM_MAX_HIGHPOWER=4 options SCSI_NO_SENSE_STRINGS options SCSI_NO_OP_STRINGS options SCSI_DELAY=8000 # Be pessimistic about Joe SCSI device # Options for the CAM CDROM driver: # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only # enforced if there is I/O waiting for another LUN # The compiled in defaults for these variables are 2 and 10 seconds, # respectively. # # These can also be changed on the fly with the following sysctl variables: # kern.cam.cd.changer.min_busy_seconds # kern.cam.cd.changer.max_busy_seconds # options CHANGER_MIN_BUSY_SECONDS=2 options CHANGER_MAX_BUSY_SECONDS=10 # Options for the CAM sequential access driver: # SA_IO_TIMEOUT: Timeout for read/write/wfm operations, in minutes # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT. options SA_IO_TIMEOUT=4 options SA_SPACE_TIMEOUT=60 options SA_REWIND_TIMEOUT=(2*60) options SA_ERASE_TIMEOUT=(4*60) options SA_1FM_AT_EOD # Optional timeout for the CAM processor target (pt) device # This is specified in seconds. The default is 60 seconds. options SCSI_PT_DEFAULT_TIMEOUT=60 # Optional enable of doing SES passthrough on other devices (e.g., disks) # # Normally disabled because a lot of newer SCSI disks report themselves # as having SES capabilities, but this can then clot up attempts to build # a topology with the SES device that's on the box these drives # are in....
options SES_ENABLE_PASSTHROUGH ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS # The `pty' device usually turns out to be ``effectively mandatory'', # as it is required for `telnetd', `rlogind', `screen', `emacs', and # `xterm', among others. device pty #Pseudo ttys device nmdm #back-to-back tty devices device md #Memory/malloc disk device snp #Snoop device - to look at pty/vty/etc.. device ccd #Concatenated disk driver # Configuring Vinum into the kernel is not necessary, since the kld # module gets started automatically when vinum(8) starts. This # device is also untested. Use at your own risk. # # The option VINUMDEBUG must match the value set in CFLAGS # in src/sbin/vinum/Makefile. Failure to do so will result in # the following message from vinum(8): # # Can't get vinum config: Invalid argument # # see vinum(4) for more reasons not to use these options. device vinum #Vinum concat/mirror/raid driver options VINUMDEBUG #enable Vinum debugging hooks # Kernel side iconv library options LIBICONV # Size of the kernel message buffer. Should be N * pagesize. options MSGBUF_SIZE=40960 # Maximum size of a tty or pty input buffer. options TTYHOG=8193 ##################################################################### # HARDWARE DEVICE CONFIGURATION # For ISA the required hints are listed. # EISA, MCA, PCI and pccard are self identifying buses, so no hints # are needed. # # Mandatory devices: # # The keyboard controller; it controls the keyboard and the PS/2 mouse. device atkbdc hint.atkbdc.0.at="isa" hint.atkbdc.0.port="0x060" # The AT keyboard device atkbd hint.atkbd.0.at="atkbdc" hint.atkbd.0.irq="1" # Options for atkbd: options ATKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions ATKBD_DFLT_KEYMAP=jp.106 # These options are valid for other keyboard drivers as well. 
options KBD_DISABLE_KEYMAP_LOAD # refuse to load a keymap options KBD_INSTALL_CDEV # install a CDEV entry in /dev # `flags' for atkbd: # 0x01 Force detection of keyboard, else we always assume a keyboard # 0x02 Don't reset keyboard, useful for some newer ThinkPads # 0x03 Force detection and avoid reset, might help with certain # dockingstations # 0x04 Old-style (XT) keyboard support, useful for older ThinkPads # PS/2 mouse device psm hint.psm.0.at="atkbdc" hint.psm.0.irq="12" # Options for psm: options PSM_HOOKRESUME #hook the system resume event, useful #for some laptops options PSM_RESETAFTERSUSPEND #reset the device at the resume event # Video card driver for VGA adapters. device vga hint.vga.0.at="isa" # Options for vga: # Try the following option if the mouse pointer is not drawn correctly # or font does not seem to be loaded properly. May cause flicker on # some systems. options VGA_ALT_SEQACCESS # If you can dispense with some vga driver features, you may want to # use the following options to save some memory. #options VGA_NO_FONT_LOADING # don't save/load font #options VGA_NO_MODE_CHANGE # don't change video modes # Older video cards may require this option for proper operation. options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs # The following option probably won't work with the LCD displays. options VGA_WIDTH90 # support 90 column modes options FB_DEBUG # Frame buffer debugging device splash # Splash screen and screen saver support # Various screen savers. device blank_saver device daemon_saver device fade_saver device fire_saver device green_saver device logo_saver device rain_saver device star_saver device warp_saver # The syscons console driver (sco color console compatible). 
device sc hint.sc.0.at="isa" options MAXCONS=16 # number of virtual consoles options SC_ALT_MOUSE_IMAGE # simplified mouse cursor in text mode options SC_DFLT_FONT # compile font in makeoptions SC_DFLT_FONT=cp850 options SC_DISABLE_KDBKEY # disable `debug' key options SC_DISABLE_REBOOT # disable reboot key sequence options SC_HISTORY_SIZE=200 # number of history buffer lines options SC_MOUSE_CHAR=0x3 # char code for text mode mouse cursor options SC_PIXEL_MODE # add support for the raster text mode # The following options will let you change the default colors of syscons. options SC_NORM_ATTR=(FG_GREEN|BG_BLACK) options SC_NORM_REV_ATTR=(FG_YELLOW|BG_GREEN) options SC_KERNEL_CONS_ATTR=(FG_RED|BG_BLACK) options SC_KERNEL_CONS_REV_ATTR=(FG_BLACK|BG_RED) # The following options will let you change the default behaviour of # cut-n-paste feature options SC_CUT_SPACES2TABS # convert leading spaces into tabs options SC_CUT_SEPCHARS=\"x09\" # set of characters that delimit words # (default is single space - \"x20\") # If you have a two button mouse, you may want to add the following option # to use the right button of the mouse to paste text. options SC_TWOBUTTON_MOUSE # You can selectively disable features in syscons. options SC_NO_CUTPASTE options SC_NO_FONT_LOADING options SC_NO_HISTORY options SC_NO_SYSMOUSE options SC_NO_SUSPEND_VTYSWITCH # `flags' for sc # 0x80 Put the video card in the VESA 800x600 dots, 16 color mode # 0x100 Probe for a keyboard device periodically if one is not present # # Optional devices: # # # SCSI host adapters: # # adv: All Narrow SCSI bus AdvanSys controllers. # adw: Second Generation AdvanSys controllers including the ADV940UW. # aha: Adaptec 154x/1535/1640 # ahb: Adaptec 174x EISA controllers # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/ # 19160x/29160x, aic7770/aic78xx # ahd: Adaptec 29320/39320 Controllers. 
# aic: Adaptec 6260/6360, APA-1460 (PC Card), NEC PC9801-100 (C-BUS) # amd: Support for the AMD 53C974 SCSI host adapter chip as found on devices # such as the Tekram DC-390(T). # bt: Most Buslogic controllers: including BT-445, BT-54x, BT-64x, BT-74x, # BT-75x, BT-946, BT-948, BT-956, BT-958, SDC3211B, SDC3211F, SDC3222F # esp: NCR53c9x. Only for SBUS hardware right now. # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters, # ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2, # ISP 12160 Ultra3 SCSI, # Qlogic ISP 2100 and ISP 2200 1Gb Fibre Channel host adapters. # Qlogic ISP 2300 and ISP 2312 2Gb Fibre Channel host adapters. # ispfw: Firmware module for Qlogic host adapters # mpt: LSI-Logic MPT/Fusion 53c1020 or 53c1030 Ultra4 # or FC9x9 Fibre Channel host adapters. # ncr: NCR 53C810, 53C825 self-contained SCSI host adapters. # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors: # 53C810, 53C810A, 53C815, 53C825, 53C825A, 53C860, 53C875, # 53C876, 53C885, 53C895, 53C895A, 53C896, 53C897, 53C1510D, # 53C1010-33, 53C1010-66. # trm: Tekram DC395U/UW/F DC315U adapters. # wds: WD7000 # # Note that the order is important in order for Buslogic ISA/EISA cards to be # probed correctly. # device bt hint.bt.0.at="isa" hint.bt.0.port="0x330" device adv hint.adv.0.at="isa" device adw device aha hint.aha.0.at="isa" device aic hint.aic.0.at="isa" device ahb device ahc device ahd device amd device esp device isp hint.isp.0.disable="1" hint.isp.0.role="3" hint.isp.0.prefer_iomap="1" hint.isp.0.prefer_memmap="1" hint.isp.0.fwload_disable="1" hint.isp.0.ignore_nvram="1" hint.isp.0.fullduplex="1" hint.isp.0.topology="lport" hint.isp.0.topology="nport" hint.isp.0.topology="lport-only" hint.isp.0.topology="nport-only" # we can't get u_int64_t types, nor can we get strings if it's got # a leading 0x, hence this silly dodge. 
hint.isp.0.portwnn="w50000000aaaa0000" hint.isp.0.nodewnn="w50000000aaaa0001" device ispfw device mpt device ncr device sym device trm device wds hint.wds.0.at="isa" hint.wds.0.port="0x350" hint.wds.0.irq="11" hint.wds.0.drq="6" # The aic7xxx driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. Unfortunately, # this doesn't work on some motherboards, which prevents it from being the # default. options AHC_ALLOW_MEMIO # Dump the contents of the ahc controller configuration PROM. options AHC_DUMP_EEPROM # Bitmap of units to enable targetmode operations. options AHC_TMODE_ENABLE # Compile in Aic7xxx Debugging code. options AHC_DEBUG # Aic7xxx driver debugging options. See sys/dev/aic7xxx/aic7xxx.h options AHC_DEBUG_OPTS # Print register bitfields in debug output. Adds ~128k to driver # See ahc(4). options AHC_REG_PRETTY_PRINT # Compile in aic79xx debugging code. options AHD_DEBUG # Aic79xx driver debugging options. Adds ~215k to driver. See ahd(4). options AHD_DEBUG_OPTS=0xFFFFFFFF # Print human-readable register definitions when debugging options AHD_REG_PRETTY_PRINT # Bitmap of units to enable targetmode operations. options AHD_TMODE_ENABLE # The adw driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. options ADW_ALLOW_MEMIO # Options used in dev/isp/ (Qlogic SCSI/FC driver). # # ISP_TARGET_MODE - enable target mode operation # options ISP_TARGET_MODE=1 # Options used in dev/sym/ (Symbios SCSI driver). 
#options SYM_SETUP_LP_PROBE_MAP #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d #options SYM_SETUP_SCSI_DIFF #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 #options SYM_SETUP_PCI_PARITY #-PCI parity checking # disabled:0, enabled:1 (default) #options SYM_SETUP_MAX_LUN #-Number of LUNs supported # default:8, range:[1..64] # The 'asr' driver provides support for current DPT/Adaptec SCSI RAID # controllers (SmartRAID V and VI and later). # These controllers require the CAM infrastructure. # device asr # The 'dpt' driver provides support for old DPT controllers (http://www.dpt.com/). # These have hardware RAID-{0,1,5} support, and do multi-initiator I/O. # The DPT controllers are commonly re-licensed under other brand-names - # some controllers by Olivetti, Dec, HP, AT&T, SNI, AST, Alphatronic, NEC and # Compaq are actually DPT controllers. # # See src/sys/dev/dpt for debugging and other subtle options. # DPT_MEASURE_PERFORMANCE Enables a set of (semi)invasive metrics. Various # instruments are enabled. The tools in # /usr/sbin/dpt_* assume these to be enabled. # DPT_HANDLE_TIMEOUTS Normally device timeouts are handled by the DPT. # If you want the driver to handle timeouts, enable # this option. If your system is very busy, this # option will create more trouble than it solves. # DPT_TIMEOUT_FACTOR Used to compute the excessive amount of time to # wait when timing out with the above option. # DPT_DEBUG_xxxx These are controllable from sys/dev/dpt/dpt.h # DPT_LOST_IRQ When enabled, will try, once per second, to catch # any interrupt that got lost. Seems to help in some # DPT-firmware/Motherboard combinations. Minimal # cost, great benefit. # DPT_RESET_HBA Make "reset" actually reset the controller # instead of fudging it. Only enable this if you # are 100% certain you need it.
device dpt # DPT options #!CAM# options DPT_MEASURE_PERFORMANCE #!CAM# options DPT_HANDLE_TIMEOUTS options DPT_TIMEOUT_FACTOR=4 options DPT_LOST_IRQ options DPT_RESET_HBA # # Compaq "CISS" RAID controllers (SmartRAID 5* series) # These controllers have a SCSI-like interface, and require the # CAM infrastructure. # device ciss # # Intel Integrated RAID controllers. # This driver was developed and is maintained by Intel. Contacts # at Intel for this driver are # "Kannanthanam, Boji T" and # "Leubner, Achim" . # device iir # # Mylex AcceleRAID and eXtremeRAID controllers with v6 and later # firmware. These controllers have a SCSI-like interface, and require # the CAM infrastructure. # device mly # # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers. Only # one entry is needed; the code will find and configure all supported # controllers. # device ida # Compaq Smart RAID device mlx # Mylex DAC960 device amr # AMI MegaRAID # # 3ware ATA RAID # device twe # 3ware ATA RAID # # The 'ATA' driver supports all ATA and ATAPI devices, including PC Card # devices. You only need one "device ata" for it to find all # PCI and PC Card ATA/ATAPI devices on modern machines. device ata device atadisk # ATA disk drives device ataraid # ATA RAID drives device atapicd # ATAPI CDROM drives device atapifd # ATAPI floppy drives device atapist # ATAPI tape drives device atapicam # emulate ATAPI devices as SCSI ditto via CAM # needs CAM to be present (scbus & pass) # # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add: hint.ata.0.at="isa" hint.ata.0.port="0x1f0" hint.ata.0.irq="14" hint.ata.1.at="isa" hint.ata.1.port="0x170" hint.ata.1.irq="15" # # The following options are valid on the ATA driver: # # ATA_STATIC_ID: controller numbering is static ie depends on location # else the device numbers are dynamically allocated. 
options ATA_STATIC_ID # # Standard floppy disk controllers and floppy tapes, supports # the Y-E DATA External FDD (PC Card) # device fdc hint.fdc.0.at="isa" hint.fdc.0.port="0x3F0" hint.fdc.0.irq="6" hint.fdc.0.drq="2" # # FDC_DEBUG enables floppy debugging. Since the debug output is huge, you # gotta turn it actually on by setting the variable fd_debug with DDB, # however. options FDC_DEBUG # # Activate this line if you happen to have an Insight floppy tape. # Probing them proved to be dangerous for people with floppy disks only, # so it's "hidden" behind a flag: #hint.fdc.0.flags="1" # Specify floppy devices hint.fd.0.at="fdc0" hint.fd.0.drive="0" hint.fd.1.at="fdc0" hint.fd.1.drive="1" # # sio: serial ports (see sio(4)), including support for various # PC Card devices, such as Modem and NICs (see etc/defaults/pccard.conf) # device sio hint.sio.0.at="isa" hint.sio.0.port="0x3F8" hint.sio.0.flags="0x10" hint.sio.0.irq="4" # Options for sio: options COM_ESP # Code for Hayes ESP. options COM_MULTIPORT # Code for some cards with shared IRQs. options CONSPEED=115200 # Speed for serial console # (default 9600). # `flags' specific to sio(4). See below for flags used by both sio(4) and # uart(4). # 0x20 force this unit to be the console (unless there is another # higher priority console). This replaces the COMCONSOLE option. # 0x40 reserve this unit for low level console operations. Do not # access the device in any normal way. # PnP `flags' # 0x1 disable probing of this device. Used to prevent your modem # from being attached as a PnP modem. # Other flags for sio that aren't documented in the man page. # 0x20000 enable hardware RTS/CTS and larger FIFOs. Only works for # ST16650A-compatible UARTs. # # uart: newbusified driver for serial interfaces. It consolidates the sio(4), # sab(4) and zs(4) drivers. # device uart # Options for uart(4) options UART_PPS_ON_CTS # Do time pulse capturing using CTS # instead of DCD. 
# The following hint should only be used for pure ISA devices. It is not # needed otherwise. Use of hints is strongly discouraged. hint.uart.0.at="isa" # The following 3 hints are used when the UART is a system device (i.e., a # console or debug port), but only on platforms that don't have any other # means to pass the information to the kernel. The unit number of the hint # is only used to bundle the hints together. There is no relation to the # unit number of the probed UART. hint.uart.0.port="0x3f8" hint.uart.0.flags="0x10" hint.uart.0.baud="115200" # `flags' for serial drivers that support consoles like sio(4) and uart(4): # 0x10 enable console support for this unit. Other console flags # (if applicable) are ignored unless this is set. Enabling # console support does not make the unit the preferred console. # Boot with -h or set boot_serial=YES in the loader. For sio(4) # specifically, the 0x20 flag can also be set (see above). # Currently, at most one unit can have console support; the # first one (in config file order) with this flag set is # preferred. Setting this flag for sio0 gives the old behaviour. # 0x80 use this port for serial line gdb support in ddb. Also known # as debug port. # # Options for serial drivers that support consoles: options BREAK_TO_DEBUGGER # A BREAK on a serial console goes to # ddb, if available. # Solaris implements a new BREAK which is initiated by a character # sequence CR ~ ^b which is similar to a familiar pattern used on # Sun servers by the Remote Console. options ALT_BREAK_TO_DEBUGGER # PCI Universal Communications driver # Supports various single and multi port PCI serial cards. Maybe later # also the parallel ports on combination serial/parallel cards. New cards # can be added in src/sys/dev/puc/pucdata.c. # # If the PUC_FASTINTR option is used the driver will try to use fast # interrupts. The card must then be the only user of that interrupt. # Interrupts cannot be shared when using PUC_FASTINTR. 
device puc options PUC_FASTINTR # # Network interfaces: # # MII bus support is required for some PCI 10/100 ethernet NICs, # namely those which use MII-compliant transceivers or implement # transceiver control interfaces that operate like an MII. Adding # "device miibus0" to the kernel config pulls in support for # the generic miibus API and all of the PHY drivers, including a # generic one for PHYs that aren't specifically handled by an # individual driver. device miibus # an: Aironet 4500/4800 802.11 wireless adapters. Supports the PCMCIA, # PCI and ISA varieties. # awi: Support for IEEE 802.11 PC Card devices using the AMD Am79C930 and # Harris (Intersil) Chipset with PCnetMobile firmware by AMD. # bge: Support for gigabit ethernet adapters based on the Broadcom # BCM570x family of controllers, including the 3Com 3c996-T, # the Netgear GA302T, the SysKonnect SK-9D21 and SK-9D41, and # the embedded gigE NICs on Dell PowerEdge 2550 servers. # cm: Arcnet SMC COM90c26 / SMC COM90c56 # (and SMC COM90c66 in '56 compatibility mode) adapters. # cnw: Xircom CNW/Netware Airsurfer PC Card adapter # cs: IBM Etherjet and other Crystal Semi CS89x0-based adapters # dc: Support for PCI fast ethernet adapters based on the DEC/Intel 21143 # and various workalikes including: # the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics # AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On # 82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II # and the Macronix 98713/98713A/98715/98715A/98725 PMAC. This driver # replaces the old al, ax, dm, pn and mx drivers. List of brands: # Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110, # SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX, # LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204, # KNE110TX. # de: Digital Equipment DC21040 # em: Intel Pro/1000 Gigabit Ethernet 82542, 82543, 82544 based adapters. 
# ep: 3Com 3C509, 3C529, 3C556, 3C562D, 3C563D, 3C572, 3C574X, 3C579, 3C589 # and PC Card devices using these chipsets. # ex: Intel EtherExpress Pro/10 and other i82595-based adapters, # Olicom Ethernet PC Card devices. # fe: Fujitsu MB86960A/MB86965A Ethernet # fea: DEC DEFEA EISA FDDI adapter # fpa: Support for the Digital DEFPA PCI FDDI. `device fddi' is also needed. # fxp: Intel EtherExpress Pro/100B # (hint of prefer_iomap can be done to prefer I/O instead of Mem mapping) # gx: Intel Pro/1000 Gigabit Ethernet (82542, 82543-F, 82543-T) # hme: Sun HME (Happy Meal Ethernet) # lge: Support for PCI gigabit ethernet adapters based on the Level 1 # LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX, # SMC TigerCard 1000 (SMC9462SX), and some Addtron cards. # my: Myson Fast Ethernet (MTD80X, MTD89X) # nge: Support for PCI gigabit ethernet adapters based on the National # Semiconductor DP83820 and DP83821 chipset. This includes the # SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet # GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the LinkSys # EG1032 and EG1064, the Surecom EP-320G-TX and the Netgear GA622T. # pcn: Support for PCI fast ethernet adapters based on the AMD Am79c97x # chipsets, including the PCnet/FAST, PCnet/FAST+, PCnet/PRO and # PCnet/Home. These were previously handled by the lnc driver (and # still will be if you leave this driver out of the kernel). # rl: Support for PCI fast ethernet adapters based on the RealTek 8129/8139 # chipset. Note that the RealTek driver defaults to using programmed # I/O to do register accesses because memory mapped mode seems to cause # severe lockups on SMP hardware. This driver also supports the # Accton EN1207D `Cheetah' adapter, which uses a chip called # the MPX 5030/5038, which is either a RealTek in disguise or a # RealTek workalike. Note that the D-Link DFE-530TX+ uses the RealTek # chipset and is supported by this driver, not the 'vr' driver. 
# sf: Support for Adaptec Duralink PCI fast ethernet adapters based on the # Adaptec AIC-6915 "starfire" controller. # This includes dual and quad port cards, as well as one 100baseFX card. # Most of these are 64-bit PCI devices, except for one single port # card which is 32-bit. # sis: Support for NICs based on the Silicon Integrated Systems SiS 900, # SiS 7016 and NS DP83815 PCI fast ethernet controller chips. # sbsh: Support for Granch SBNI16 SHDSL modem PCI adapters # sk: Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs. # This includes the SK-9841 and SK-9842 single port cards (single mode # and multimode fiber) and the SK-9843 and SK-9844 dual port cards # (also single mode and multimode). # The driver will autodetect the number of ports on the card and # attach each one as a separate network interface. # sn: Support for ISA and PC Card Ethernet devices using the # SMC91C90/92/94/95 chips. # ste: Sundance Technologies ST201 PCI fast ethernet controller, includes # the D-Link DFE-550TX. # ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the # 3Com 3c985, the Netgear GA620 and various others. Note that you will # probably want to bump up NMBCLUSTERS a lot to use this driver. # tl: Support for the Texas Instruments TNETE100 series 'ThunderLAN' # cards and integrated ethernet controllers. This includes several # Compaq Netelligent 10/100 cards and the built-in ethernet controllers # in several Compaq Prosignia, Proliant and Deskpro systems. It also # supports several Olicom 10Mbps and 10/100 boards. # tx: SMC 9432 TX, BTX and FTX cards. 
(SMC EtherPower II series) # txp: Support for 3Com 3cR990 cards with the "Typhoon" chipset # vr: Support for various fast ethernet adapters based on the VIA # Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips, # including the D-Link DFE530TX (see 'rl' for DFE530TX+), the Hawking # Technologies PN102TX, and the AOpen/Acer ALN-320. # vx: 3Com 3C590 and 3C595 # wb: Support for fast ethernet adapters based on the Winbond W89C840F chip. # Note: this is not the same as the Winbond W89C940F, which is a # NE2000 clone. # wi: Lucent WaveLAN/IEEE 802.11 PCMCIA adapters. Note: this supports both # the PCMCIA and ISA cards: the ISA card is really a PCMCIA to ISA # bridge with a PCMCIA adapter plugged into it. # xe: Xircom/Intel EtherExpress Pro100/16 PC Card ethernet controller, # Accton Fast EtherCard-16, Compaq Netelligent 10/100 PC Card, # Toshiba 10/100 Ethernet PC Card, Xircom 16-bit Ethernet + Modem 56 # xl: Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast) # Etherlink XL cards and integrated controllers. This includes the # integrated 3c905B-TX chips in certain Dell Optiplex and Dell # Precision desktop machines and the integrated 3c905-TX chips # in Dell Latitude laptop docking stations. # Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX # Order for ISA/EISA devices is important here device cm hint.cm.0.at="isa" hint.cm.0.port="0x2e0" hint.cm.0.irq="9" hint.cm.0.maddr="0xdc000" device cs hint.cs.0.at="isa" hint.cs.0.port="0x300" device ep device ex device fe hint.fe.0.at="isa" hint.fe.0.port="0x300" device fea device sn hint.sn.0.at="isa" hint.sn.0.port="0x300" hint.sn.0.irq="10" device an device awi device cnw device wi device xe # PCI Ethernet NICs that use the common MII bus controller code. 
device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) hint.fxp.0.prefer_iomap="0" device hme # Sun HME (Happy Meal Ethernet) device my # Myson Fast Ethernet (MTD80X, MTD89X) device rl # RealTek 8129/8139 device pcn # AMD Am79C97x PCI 10/100 NICs device sf # Adaptec AIC-6915 (``Starfire'') device sbsh # Granch SBNI16 SHDSL modem device sis # Silicon Integrated Systems SiS 900/SiS 7016 device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Gigabit & FDDI NICs. device bge device gx device lge device nge device sk device ti device fpa # Use "private" jumbo buffers allocated exclusively for the ti(4) driver. # This option is incompatible with the TI_JUMBO_HDRSPLIT option below. #options TI_PRIVATE_JUMBOS # Turn on the header splitting option for the ti(4) driver firmware. This # only works for Tigon II chips, and has no effect for Tigon I chips. options TI_JUMBO_HDRSPLIT # These two options allow manipulating the mbuf cluster size and mbuf size, # respectively. Be very careful with NIC driver modules when changing # these from their default values, because that can potentially cause a # mismatch between the mbuf size assumed by the kernel and the mbuf size # assumed by a module. The only driver that currently has the ability to # detect a mismatch is ti(4). 
options MCLSHIFT=12 # mbuf cluster shift in bits, 12 == 4KB options MSIZE=512 # mbuf size in bytes # # ATM related options (Cranor version) # (note: this driver cannot be used with the HARP ATM stack) # # The `en' device provides support for Efficient Networks (ENI) # ENI-155 PCI midway cards, and the Adaptec 155Mbps PCI ATM cards (ANA-59x0). # # The `hatm' device provides support for Fore/Marconi HE155 and HE622 # ATM PCI cards. # # The `fatm' device provides support for Fore PCA200E ATM PCI cards. # # The `patm' device provides support for IDT77252 based cards like # ProSum's ProATM-155 and ProATM-25 and IDT's evaluation boards. # # atm device provides generic atm functions and is required for # atm devices. # NATM enables the netnatm protocol family that can be used to # bypass TCP/IP. # # utopia provides the access to the ATM PHY chips and is required for en, # hatm and fatm. # # the current driver supports only PVC operations (no atm-arp, no multicast). # for more details, please read the original documents at # http://www.ccrc.wustl.edu/pub/chuck/tech/bsdatm/bsdatm.html # device atm device en device fatm #Fore PCA200E device hatm #Fore/Marconi HE155/622 device patm #IDT77252 cards (ProATM and IDT) device utopia #ATM PHY driver options NATM #native ATM options LIBMBPOOL #needed by patm, iatm # # Sound drivers # # sound: The generic sound driver. # device sound # # snd_*: Device-specific drivers. # # The flags of the device tells the device a bit more info about the # device that normally is obtained through the PnP interface. # bit 2..0 secondary DMA channel; # bit 4 set if the board uses two dma channels; # bit 15..8 board type, overrides autodetection; leave it # zero if don't know what to put in (and you don't, # since this is unsupported at the moment...). # # snd_als4000: Avance Logic ALS4000 PCI. # snd_ad1816: Analog Devices AD1816 ISA PnP/non-PnP. # snd_cmi: CMedia CMI8338/CMI8738 PCI. # snd_cs4281: Crystal Semiconductor CS4281 PCI. 
# snd_csa: Crystal Semiconductor CS461x/428x PCI. (except # 4281) # snd_ds1: Yamaha DS-1 PCI. # snd_emu10k1: Creative EMU10K1 PCI and EMU10K2 (Audigy) PCI. # snd_es137x: Ensoniq AudioPCI ES137x PCI. # snd_ess: Ensoniq ESS ISA PnP/non-PnP. # snd_fm801: Forte Media FM801 PCI. # snd_gusc: Gravis UltraSound ISA PnP/non-PnP. # snd_ich: Intel ICH PCI and some more audio controllers # embedded in a chipset. # snd_maestro: ESS Technology Maestro-1/2x PCI. # snd_maestro3: ESS Technology Maestro-3/Allegro PCI. # snd_mss: Microsoft Sound System ISA PnP/non-PnP. # snd_neomagic: Neomagic 256 AV/ZX PCI. # snd_sb16: Creative SoundBlaster16, to be used in # conjunction with snd_sbc. # snd_sb8: Creative SoundBlaster (pre-16), to be used in # conjunction with snd_sbc. # snd_sbc: Creative SoundBlaster ISA PnP/non-PnP. # Supports ESS and Avance ISA chips as well. # snd_solo: ESS Solo-1x PCI. # snd_t4dwave: Trident 4DWave PCI, SiS 7018 PCI and Acer Labs # M5451 PCI. # snd_via8233: VIA VT8233x PCI. # snd_via82c686: VIA VT82C686A PCI. # snd_vibes: S3 Sonicvibes PCI. # snd_uaudio: USB audio.
device snd_ad1816 device snd_als4000 #device snd_au88x0 device snd_cmi device snd_cs4281 device snd_csa device snd_ds1 device snd_emu10k1 device snd_es137x device snd_ess device snd_fm801 device snd_gusc device snd_ich device snd_maestro device snd_maestro3 device snd_mss device snd_neomagic device snd_sb16 device snd_sb8 device snd_sbc device snd_solo device snd_t4dwave device snd_via8233 device snd_via82c686 device snd_vibes #device snd_vortex1 device snd_uaudio # For non-pnp sound cards: hint.snd_mss.0.at="isa" hint.snd_mss.0.irq="10" hint.snd_mss.0.drq="1" hint.snd_mss.0.flags="0x0" hint.snd_sbc.0.at="isa" hint.snd_sbc.0.port="0x220" hint.snd_sbc.0.irq="5" hint.snd_sbc.0.drq="1" hint.snd_sbc.0.flags="0x15" hint.snd_gusc.0.at="isa" hint.snd_gusc.0.port="0x220" hint.snd_gusc.0.irq="5" hint.snd_gusc.0.drq="1" hint.snd_gusc.0.flags="0x13" # # Miscellaneous hardware: # # scd: Sony CD-ROM using proprietary (non-ATAPI) interface # mcd: Mitsumi CD-ROM using proprietary (non-ATAPI) interface # bktr: Brooktree bt848/848a/849a/878/879 video capture and TV Tuner board # cy: Cyclades serial driver # joy: joystick (including IO DATA PCJOY PC Card joystick) # rc: RISCom/8 multiport card # rp: Comtrol Rocketport(ISA/PCI) - single card # si: Specialix SI/XIO 4-32 port terminal multiplexor # nmdm: nullmodem terminal driver (see nmdm(4)) # Notes on the Comtrol Rocketport driver: # # The exact values used for rp0 depend on how many boards you have # in the system. 
The manufacturer's sample configs are listed as: # # device rp # core driver support # # Comtrol Rocketport ISA single card # hint.rp.0.at="isa" # hint.rp.0.port="0x280" # # If instead you have two ISA cards, one installed at 0x100 and the # second installed at 0x180, then you should add the following to # your kernel probe hints: # hint.rp.0.at="isa" # hint.rp.0.port="0x100" # hint.rp.1.at="isa" # hint.rp.1.port="0x180" # # For 4 ISA cards, it might be something like this: # hint.rp.0.at="isa" # hint.rp.0.port="0x180" # hint.rp.1.at="isa" # hint.rp.1.port="0x100" # hint.rp.2.at="isa" # hint.rp.2.port="0x340" # hint.rp.3.at="isa" # hint.rp.3.port="0x240" # # For PCI cards, you need no hints. # Mitsumi CD-ROM device mcd hint.mcd.0.at="isa" hint.mcd.0.port="0x300" # for the Sony CDU31/33A CDROM device scd hint.scd.0.at="isa" hint.scd.0.port="0x230" device joy # PnP aware, hints for nonpnp only hint.joy.0.at="isa" hint.joy.0.port="0x201" device rc hint.rc.0.at="isa" hint.rc.0.port="0x220" hint.rc.0.irq="12" device rp hint.rp.0.at="isa" hint.rp.0.port="0x280" device si options SI_DEBUG hint.si.0.at="isa" hint.si.0.maddr="0xd0000" hint.si.0.irq="12" device nmdm # # The 'bktr' device is a PCI video capture device using the Brooktree # bt848/bt848a/bt849a/bt878/bt879 chipset. When used with a TV Tuner it forms a # TV card, e.g. Miro PC/TV, Hauppauge WinCast/TV WinTV, VideoLogic Captivator, # Intel Smart Video III, AverMedia, IMS Turbo, FlyVideo. # # options OVERRIDE_CARD=xxx # options OVERRIDE_TUNER=xxx # options OVERRIDE_MSP=1 # options OVERRIDE_DBX=1 # These options can be used to override the auto detection # The current values for xxx are found in src/sys/dev/bktr/bktr_card.h # Using sysctl(8) run-time overrides on a per-card basis can be made # # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_PAL # or # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_NTSC # Specifies the default video capture mode. 
# This is required for Dual Crystal (28&35Mhz) boards where PAL is used # to prevent hangs during initialisation, e.g. VideoLogic Captivator PCI. # # options BKTR_USE_PLL # This is required for PAL or SECAM boards with a 28Mhz crystal and no 35Mhz # crystal, e.g. some new Bt878 cards. # # options BKTR_GPIO_ACCESS # This enables IOCTLs which give user level access to the GPIO port. # # options BKTR_NO_MSP_RESET # Prevents the MSP34xx reset. Good if you initialise the MSP in another OS first # # options BKTR_430_FX_MODE # Switch Bt878/879 cards into Intel 430FX chipset compatibility mode. # # options BKTR_SIS_VIA_MODE # Switch Bt878/879 cards into SIS/VIA chipset compatibility mode which is # needed for some old SiS and VIA chipset motherboards. # This also allows Bt878/879 chips to work on old OPTi (<1997) chipset # motherboards and motherboards with bad or incomplete PCI 2.1 support. # As a rough guess, old = before 1998 # # options BKTR_NEW_MSP34XX_DRIVER # Use new, more complete initialization scheme for the msp34* soundchip. # Should fix stereo autodetection if the old driver does only output # mono sound. # # options BKTR_USE_FREEBSD_SMBUS # Compile with FreeBSD SMBus implementation # # Brooktree driver has been ported to the new I2C framework. Thus, # you'll need to have the following lines in the kernel config. # device smbus # device iicbus # device iicbb # device iicsmb # The iic and smb devices are only needed if you want to control other # I2C slaves connected to the external connector of some cards. # device bktr # # PC Card/PCMCIA # (OLDCARD) # # card: pccard slots # pcic: isa/pccard bridge #device pcic #hint.pcic.0.at="isa" #hint.pcic.1.at="isa" #device card 1 # # PC Card/PCMCIA and Cardbus # (NEWCARD) # # Note that NEWCARD and OLDCARD are incompatible. Do not use both at the same # time.
# # pccbb: pci/cardbus bridge implementing YENTA interface # pccard: pccard slots # cardbus: cardbus slots device cbb device pccard device cardbus # # SMB bus # # System Management Bus support is provided by the 'smbus' device. # Access to the SMBus device is via the 'smb' device (/dev/smb*), # which is a child of the 'smbus' device. # # Supported devices: # smb standard io through /dev/smb* # # Supported SMB interfaces: # iicsmb I2C to SMB bridge with any iicbus interface # bktr brooktree848 I2C hardware interface # intpm Intel PIIX4 (82371AB, 82443MX) Power Management Unit # alpm Acer Aladdin-IV/V/Pro2 Power Management Unit # ichsmb Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA) # viapm VIA VT82C586B/596B/686A and VT8233 Power Management Unit # amdpm AMD 756 Power Management Unit # nfpm NVIDIA nForce Power Management Unit # device smbus # Bus support, required for smb below. device intpm device alpm device ichsmb device viapm device amdpm device nfpm device smb # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported devices: # ic i2c network interface # iic i2c standard io # iicsmb i2c to smb bridge. Allow i2c i/o with smb commands. # # Supported interfaces: # bktr brooktree848 I2C software interface # # Other: # iicbb generic I2C bit-banging code (needed by lpbb, bktr) # device iicbus # Bus support, required for ic/iic/iicsmb below. device iicbb device ic device iic device iicsmb # smb over i2c bridge # Parallel-Port Bus # # Parallel port bus support is provided by the `ppbus' device. # Multiple devices may be attached to the parallel port, devices # are automatically probed and attached when found. # # Supported devices: # vpo Iomega Zip Drive # Requires SCSI disk support ('scbus' and 'da'), best # performance is achieved with ports in EPP 1.9 mode. 
# lpt Parallel Printer # plip Parallel network interface # ppi General-purpose I/O ("Geek Port") + IEEE1284 I/O # pps Pulse per second Timing Interface # lpbb Philips official parallel port I2C bit-banging interface # # Supported interfaces: # ppc ISA-bus parallel port interfaces. # options PPC_PROBE_CHIPSET # Enable chipset specific detection # (see flags in ppc(4)) options DEBUG_1284 # IEEE1284 signaling protocol debug options PERIPH_1284 # Makes your computer act as an IEEE1284 # compliant peripheral options DONTPROBE_1284 # Avoid boot detection of PnP parallel devices options VP0_DEBUG # ZIP/ZIP+ debug options LPT_DEBUG # Printer driver debug options PPC_DEBUG # Parallel chipset level debug options PLIP_DEBUG # Parallel network IP interface debug options PCFCLOCK_VERBOSE # Verbose pcfclock driver options PCFCLOCK_MAX_RETRIES=5 # Maximum read tries (default 10) device ppc hint.ppc.0.at="isa" hint.ppc.0.irq="7" device ppbus device vpo device lpt device plip device ppi device pps device lpbb device pcfclock # Kernel BOOTP support options BOOTP # Use BOOTP to obtain IP address/hostname # Requires NFSCLIENT and NFS_ROOT options BOOTP_NFSROOT # NFS mount root filesystem using BOOTP info options BOOTP_NFSV3 # Use NFS v3 to NFS mount root options BOOTP_COMPAT # Workaround for broken bootp daemons. options BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP # # Add software watchdog routines. # options SW_WATCHDOG # # Disable swapping of upages and stack pages. This option removes all # code which actually performs swapping, so it's not possible to turn # it back on at run-time. # # This is sometimes usable for systems which don't have any swap space # (see also sysctls "vm.defer_swapspace_pageouts" and # "vm.disable_swapspace_pageouts") # #options NO_SWAPPING # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers # for sendfile(2) that are used to map file VM pages, and normally # default to a quantity that is roughly 16*MAXUSERS+512. 
You would # typically want about 4 of these for each simultaneous file send. # options NSFBUFS=1024 # # Enable extra debugging code for locks. This stores the filename and # line of whatever acquired the lock in the lock itself, and change a # number of function calls to pass around the relevant data. This is # not at all useful unless you are debugging lock code. Also note # that it is likely to break e.g. fstat(1) unless you recompile your # userland with -DDEBUG_LOCKS as well. # options DEBUG_LOCKS ##################################################################### # USB support # UHCI controller device uhci # OHCI controller device ohci # EHCI controller device ehci # General USB code (mandatory for USB) device usb # # USB Double Bulk Pipe devices device udbp # USB Fm Radio device ufm # Generic USB device driver device ugen # Human Interface Device (anything with buttons and dials) device uhid # USB keyboard device ukbd # USB printer device ulpt # USB Iomega Zip 100 Drive (Requires scbus and da) device umass # USB support for Belkin F5U109 and Magic Control Technology serial adapters device umct # USB modem support device umodem # USB mouse device ums # Diamond Rio 500 Mp3 player device urio # USB scanners device uscanner # # USB serial support device ucom # USB support for Belkin F5U103 and compatible serial adapters device ubsa # USB support for BWCT console serial adapters device ubser # USB support for serial adapters based on the FT8U100AX and FT8U232AM device uftdi # USB support for Prolific PL-2303 serial adapters device uplcom # USB Visor and Palm devices device uvisor # USB serial support for DDI pocket's PHS device uvscom # # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus # eval board. device aue # ASIX Electronics AX88172 USB 2.0 ethernet driver. Used in the # LinkSys USB200M and various other adapters. 
device axe # # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate # and Netmate II, and the Belkin F5U111. device cue # # Kawasaki LSI ethernet. Supports the LinkSys USB10T, # Entrega USB-NET-E45, Peracom Ethernet Adapter, the # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T, # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB # and 2104USB, and the Corega USB-T. device kue # # RealTek RTL8150 USB to fast ethernet. Supports the Melco LUA-KTX # and the GREEN HOUSE GH-USB100B. device rue # # Davicom DM9601E USB to fast ethernet. Supports the Corega FEther USB-TXC. device udav # debugging options for the USB subsystem # options USB_DEBUG # options for ukbd: options UKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions UKBD_DFLT_KEYMAP=it.iso # options for uplcom: options UPLCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds # options for uvscom: options UVSCOM_DEFAULT_OPKTSIZE=8 # default output packet size options UVSCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds ##################################################################### # FireWire support device firewire # FireWire bus code device sbp # SCSI over Firewire (Requires scbus and da) device sbp_targ # SBP-2 Target mode (Requires scbus and targ) device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (rfc2734 and rfc3146) ##################################################################### # dcons support (Dumb Console Device) device dcons # dumb console driver device dcons_crom # FireWire attachment options DCONS_BUF_SIZE=16384 # buffer size options DCONS_POLL_HZ=100 # polling rate options DCONS_FORCE_CONSOLE=0 # force to be the primary console options DCONS_FORCE_GDB=1 # force to be the gdb device ##################################################################### # crypto subsystem # # This is a port of the openbsd crypto framework. 
Include this when # configuring FAST_IPSEC and when you have a h/w crypto device to accelerate # user applications that link to openssl. # # Drivers are ports from openbsd with some simple enhancements that have # been fed back to openbsd. device crypto # core crypto support device cryptodev # /dev/crypto for access to h/w device rndtest # FIPS 140-2 entropy tester device hifn # Hifn 7951, 7781, etc. options HIFN_DEBUG # enable debugging support: hw.hifn.debug options HIFN_RNDTEST # enable rndtest support device ubsec # Broadcom 5501, 5601, 58xx options UBSEC_DEBUG # enable debugging support: hw.ubsec.debug options UBSEC_RNDTEST # enable rndtest support ##################################################################### # # Embedded system options: # # An embedded system might want to run something other than init. options INIT_PATH=/sbin/init:/stand/sysinstall # Debug options options BUS_DEBUG # enable newbus debugging options DEBUG_VFS_LOCKS # enable vfs lock debugging options SOCKBUF_DEBUG # enable sockbuf last record/mb tail checking ##################################################################### # SYSV IPC KERNEL PARAMETERS # # Maximum number of entries in a semaphore map. options SEMMAP=31 # Maximum number of System V semaphores that can be used on the system at # one time. options SEMMNI=11 # Total number of semaphores system wide options SEMMNS=61 # Total number of undo structures in system options SEMMNU=31 # Maximum number of System V semaphores that can be used by a single process # at one time. options SEMMSL=61 # Maximum number of operations that can be outstanding on a single System V # semaphore at one time. options SEMOPM=101 # Maximum number of undo operations that can be outstanding on a single # System V semaphore at one time. options SEMUME=11 # Maximum number of shared memory pages system wide. options SHMALL=1025 # Maximum size, in bytes, of a single System V shared memory region. 
options SHMMAX=(SHMMAXPGS*PAGE_SIZE+1) options SHMMAXPGS=1025 # Minimum size, in bytes, of a single System V shared memory region. options SHMMIN=2 # Maximum number of shared memory regions that can be used on the system # at one time. options SHMMNI=33 # Maximum number of System V shared memory regions that can be attached to # a single process at one time. options SHMSEG=9 # Set the amount of time (in seconds) the system will wait before # rebooting automatically when a kernel panic occurs. If set to (-1), # the system will wait indefinitely until a key is pressed on the # console. options PANIC_REBOOT_WAIT_TIME=16 # Attempt to bypass the buffer cache and put data directly into the # userland buffer for read operation when O_DIRECT flag is set on the # file. Both offset and length of the read operation must be # multiples of the physical media sector size. # #options DIRECTIO # Specify a lower limit for the number of swap I/O buffers. They are # (among other things) used when bypassing the buffer cache due to # DIRECTIO kernel option enabled and O_DIRECT flag set on file. # #options NSWBUF_MIN=120 ##################################################################### # More undocumented options for linting. # Note that documenting these are not considered an affront. options CAM_DEBUG_DELAY # VFS cluster debugging. options CLUSTERDEBUG options DEBUG # Kernel filelock debugging. options LOCKF_DEBUG # System V compatible message queues # Please note that the values provided here are used to test kernel # building. The defaults in the sources provide almost the same numbers. # MSGSSZ must be a power of 2 between 8 and 1024. 
options MSGMNB=2049 # Max number of chars in queue options MSGMNI=41 # Max number of message queue identifiers options MSGSEG=2049 # Max number of message segments options MSGSSZ=16 # Size of a message segment options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers options NMBCLUSTERS=1024 # Number of mbuf clusters options SCSI_NCR_DEBUG options SCSI_NCR_MAX_SYNC=10000 options SCSI_NCR_MAX_WIDE=1 options SCSI_NCR_MYADDR=7 options SC_DEBUG_LEVEL=5 # Syscons debug level options SC_RENDER_DEBUG # syscons rendering debugging options SHOW_BUSYBUFS # List buffers that prevent root unmount options SLIP_IFF_OPTS options VFS_BIO_DEBUG # VFS buffer I/O debugging options KSTACK_MAX_PAGES=32 # Maximum pages to give the kernel stack # Adaptec Array Controller driver options options AAC_DEBUG # Debugging levels: # 0 - quiet, only emit warnings # 1 - noisy, emit major function # points and things done # 2 - extremely noisy, emit trace # items in loops, etc. # Yet more undocumented options for linting. # BKTR_ALLOC_PAGES has no effect except to cause warnings, and # BROOKTREE_ALLOC_PAGES hasn't actually been superseded by it, since the # driver still mostly spells this option BROOKTREE_ALLOC_PAGES. ##options BKTR_ALLOC_PAGES=(217*4+1) options BROOKTREE_ALLOC_PAGES=(217*4+1) options MAXFILES=999 options NDEVFSINO=1025 options NDEVFSOVERFLOW=32769 # Yet more undocumented options for linting. options VGA_DEBUG diff --git a/sys/conf/options b/sys/conf/options index eee12c97b2ea..f7ac6d25b5a2 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,689 +1,690 @@ # $FreeBSD$ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. 
Try to structure option-using # code so that a single option only switches code on, or only switches # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.<machine>". Options that affect a single source-file # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_<option-name-in-lower-case>.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_<name-of-option-in-lower-case>.h AAC_DEBUG opt_aac.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h ADW_ALLOW_MEMIO opt_adw.h TWA_DEBUG opt_twa.h TWA_FLASH_FIRMWARE opt_twa.h # Debugging options. DDB DDB_NUMSYM opt_ddb.h GDB GDBSPEED opt_gdb.h KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h # Miscellaneous options.
ADAPTIVE_GIANT opt_adaptive_mutexes.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h ALQ CODA_COMPAT_5 opt_coda.h COMPAT_43 opt_compat.h COMPAT_FREEBSD4 opt_compat.h COMPILING_LINT opt_global.h CONSPEED opt_comconsole.h CY_PCI_FASTINTR DIRECTIO opt_directio.h -FULL_PREEMPTION +FULL_PREEMPTION opt_sched.h +PREEMPTION opt_sched.h GEOM_AES opt_geom.h GEOM_APPLE opt_geom.h GEOM_BDE opt_geom.h GEOM_BSD opt_geom.h GEOM_CONCAT opt_geom.h GEOM_FOX opt_geom.h GEOM_GATE opt_geom.h GEOM_GPT opt_geom.h GEOM_LABEL opt_geom.h GEOM_MBR opt_geom.h GEOM_MIRROR opt_geom.h GEOM_NOP opt_geom.h GEOM_PC98 opt_geom.h GEOM_RAID3 opt_geom.h GEOM_STRIPE opt_geom.h GEOM_SUNLABEL opt_geom.h GEOM_UZIP opt_geom.h GEOM_VOL opt_geom.h KSTACK_MAX_PAGES KSTACK_PAGES KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC MAC_ALWAYS_LABEL_MBUF opt_mac.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_DEBUG opt_mac.h MAC_IFOFF opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_SIZE opt_md.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h MUTEX_WAKE_ALL NSWBUF_MIN opt_swap.h PANIC_REBOOT_WAIT_TIME opt_panic.h PPS_SYNC opt_ntp.h PUC_FASTINTR opt_puc.h QUOTA SCHED_4BSD opt_sched.h SCHED_ULE opt_sched.h SHOW_BUSYBUFS SLEEPQUEUE_PROFILING SPX_HACK SUIDDIR opt_suiddir.h MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMAP opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TURNSTILE_PROFILING TTYHOG opt_tty.h VFS_AIO WLCACHE 
opt_wavelan.h WLDEBUG opt_wavelan.h # POSIX kernel options P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. CD9660 opt_dontuse.h CODA opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h HPFS opt_dontuse.h LINPROCFS opt_dontuse.h MSDOSFS opt_dontuse.h NTFS opt_dontuse.h NULLFS opt_dontuse.h NWFS opt_dontuse.h PORTALFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h UDF opt_dontuse.h UMAPFS opt_dontuse.h UNIONFS opt_dontuse.h # Broken - ffs_snapshot() dependency from ufs_lookup() :-( FFS opt_ffs_broken_fixme.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. NFSCLIENT opt_nfs.h NFSSERVER opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h NTFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of things # and make the system safer from crashes at the same time. # Otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. 
UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # The below sentence is not in English, and neither is this one. # We plan to remove the static dependences above, with a # _ROOT option to control if it usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h NETSMBCRYPTO opt_netsmb.h # Experimental support for large MS-DOS filesystems; SEE WARNING IN "NOTES"! MSDOSFS_LARGE opt_msdosfs.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NMBCLUSTERS opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_NEW_TRAN_CODE opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). 
SYM_SETUP_LP_PROBE_MAP opt_sym.h #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in pci/ncr.c SCSI_NCR_DEBUG opt_ncr.h SCSI_NCR_MAX_SYNC opt_ncr.h SCSI_NCR_MAX_WIDE opt_ncr.h SCSI_NCR_MYADDR opt_ncr.h # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h # Options used in the 'ata' ATA/ATAPI driver ATA_STATIC_ID opt_ata.h ATA_NOPCI opt_ata.h DEV_ATADISK opt_ata.h DEV_ATAPICD opt_ata.h DEV_ATAPIST opt_ata.h DEV_ATAPIFD opt_ata.h DEV_ATAPICAM opt_ata.h DEV_ATARAID opt_ata.h # Net stuff. ACCEPT_FILTER_DATA ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_DEBUG opt_altq.h BOOTP opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h BRIDGE opt_bdg.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h ETHER_II opt_ef.h ETHER_8023 opt_ef.h ETHER_8022 opt_ef.h ETHER_SNAP opt_ef.h MROUTING opt_mrouting.h PIM opt_mrouting.h INET opt_inet.h INET6 opt_inet6.h IPSEC opt_ipsec.h IPSEC_ESP opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_FILTERGIF opt_ipsec.h FAST_IPSEC opt_ipsec.h IPDIVERT DUMMYNET opt_ipdn.h IPFILTER opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_FORWARD opt_ipfw.h IPV6FIREWALL opt_ip6fw.h IPV6FIREWALL_VERBOSE opt_ip6fw.h IPV6FIREWALL_VERBOSE_LIMIT opt_ip6fw.h 
IPV6FIREWALL_DEFAULT_TO_ACCEPT opt_ip6fw.h IPSTEALTH IPX IPXIP opt_ipx.h LIBMBPOOL LIBMCHAIN MBUF_STRESS_TEST opt_mbuf_stress_test.h NCP NETATALK opt_atalk.h NET_WITH_GIANT opt_net.h PPP_BSDCOMP opt_ppp.h PPP_DEFLATE opt_ppp.h PPP_FILTER opt_ppp.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_SIGNATURE opt_inet.h TCP_SACK_DEBUG opt_tcp_sack.h TCP_DROP_SYNFIN opt_tcp_input.h XBONEHACK # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). NETGRAPH NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_ATMLLC opt_netgraph.h NETGRAPH_ATM_ATMPIF opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h # MPPC compression requires proprietary files (not included) NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TEE opt_netgraph.h 
NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h NGATM_ATMBASE opt_netgraph.h NGATM_SSCOP opt_netgraph.h NGATM_SSCFU opt_netgraph.h NGATM_UNI opt_netgraph.h NGATM_CCATM opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h ZERO_COPY_SOCKETS opt_zero.h TI_PRIVATE_JUMBOS opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # ATM (HARP version) ATM_CORE opt_atm.h ATM_IP opt_atm.h ATM_SIGPVC opt_atm.h ATM_SPANS opt_atm.h ATM_UNI opt_atm.h # XXX Conflict: # of devices vs network protocol (Native ATM). # This makes "atm.h" unusable. NATM # DPT driver debug flags DPT_MEASURE_PERFORMANCE opt_dpt.h DPT_HANDLE_TIMEOUTS opt_dpt.h DPT_TIMEOUT_FACTOR opt_dpt.h DPT_LOST_IRQ opt_dpt.h DPT_RESET_HBA opt_dpt.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. CLUSTERDEBUG opt_debug_cluster.h DEBUG_1284 opt_ppb_1284.h VP0_DEBUG opt_vpo.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h NPX_DEBUG opt_debug_npx.h NETATALKDEBUG opt_atalk.h SI_DEBUG opt_debug_si.h SX_DEBUG opt_debug_sx.h # Fb options FB_DEBUG opt_fb.h FB_INSTALL_CDEV opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h LOOKUP_SHARED opt_global.h MCLSHIFT opt_global.h MUTEX_DEBUG opt_global.h MUTEX_NOINLINE opt_global.h MUTEX_PROFILING opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RESTARTABLE_PANICS opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h 
MALLOC_PROFILE opt_vm.h PQ_NOOPT opt_vmpage.h PQ_NORMALCACHE opt_vmpage.h PQ_MEDIUMCACHE opt_vmpage.h PQ_LARGECACHE opt_vmpage.h PQ_HUGECACHE opt_vmpage.h PQ_CACHESIZE opt_vmpage.h # Standard SMP options SMP opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_GATHERDELAY opt_nfs.h NFS_WDELAYHASHSIZ opt_nfs.h NFS_DEBUG opt_nfs.h # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver OVERRIDE_CARD opt_bktr.h OVERRIDE_TUNER opt_bktr.h OVERRIDE_DBX opt_bktr.h OVERRIDE_MSP opt_bktr.h BROOKTREE_SYSTEM_DEFAULT opt_bktr.h BROOKTREE_ALLOC_PAGES opt_bktr.h BKTR_OVERRIDE_CARD opt_bktr.h BKTR_OVERRIDE_TUNER opt_bktr.h BKTR_OVERRIDE_DBX opt_bktr.h BKTR_OVERRIDE_MSP opt_bktr.h BKTR_SYSTEM_DEFAULT opt_bktr.h BKTR_ALLOC_PAGES opt_bktr.h BKTR_USE_PLL opt_bktr.h BKTR_GPIO_ACCESS opt_bktr.h BKTR_NO_MSP_RESET opt_bktr.h BKTR_430_FX_MODE opt_bktr.h BKTR_SIS_VIA_MODE opt_bktr.h BKTR_USE_FREEBSD_SMBUS opt_bktr.h BKTR_NEW_MSP34XX_DRIVER opt_bktr.h # options for serial support COM_ESP opt_sio.h COM_MULTIPORT opt_sio.h BREAK_TO_DEBUGGER opt_comconsole.h ALT_BREAK_TO_DEBUGGER opt_comconsole.h # Options to support PPS UART_PPS_ON_CTS opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USBVERBOSE opt_usb.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # Vinum options VINUMDEBUG opt_vinum.h # Embedded system options INIT_PATH opt_init_path.h ROOTDEVNAME opt_rootdevname.h FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h TDFX_LINUX opt_tdfx.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_SKIPSPIN opt_witness.h # options 
for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_THREADS opt_acpi.h ACPI_NO_SEMAPHORES opt_acpi.h ACPICA_PEDANTIC opt_acpi.h # options for DEVFS, see sys/fs/devfs/devfs.h NDEVFSINO opt_devfs.h NDEVFSOVERFLOW opt_devfs.h # various 'device presence' options. DEV_BPF opt_bpf.h DEV_ISA opt_isa.h DEV_MCA opt_mca.h DEV_SPLASH opt_splash.h EISA_SLOTS opt_eisa.h # ed driver ED_NO_MIIBUS opt_ed.h # wi driver WI_SYMBOL_FIRMWARE opt_wi.h # XXX bogusly global. DEVICE_POLLING opt_global.h SOCKBUF_DEBUG opt_global.h # options for ubsec driver UBSEC_DEBUG opt_ubsec.h UBSEC_RNDTEST opt_ubsec.h UBSEC_NO_RNG opt_ubsec.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h # options for the Atheros HAL (only useful with source code) AH_SUPPORT_AR5210 opt_ah.h AH_SUPPORT_AR5211 opt_ah.h AH_SUPPORT_AR5212 opt_ah.h AH_DEBUG opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_ASSERT opt_ah.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # Static unit counts NI4BTRC 
opt_i4b.h NI4BRBCH opt_i4b.h NI4BTEL opt_i4b.h NI4BIPR opt_i4b.h NI4BING opt_i4b.h NI4BISPPP opt_i4b.h diff --git a/sys/i386/include/param.h b/sys/i386/include/param.h index bb9c7a7df750..a4064e3d9d4c 100644 --- a/sys/i386/include/param.h +++ b/sys/i386/include/param.h @@ -1,147 +1,142 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)param.h 5.8 (Berkeley) 6/28/91 * $FreeBSD$ */ /* * Machine dependent constants for Intel 386. */ /* * Round p (pointer or byte index) up to a correctly-aligned value * for all data types (int, long, ...). The result is unsigned int * and must be cast to any desired pointer type. */ #ifndef _ALIGNBYTES #define _ALIGNBYTES (sizeof(int) - 1) #endif #ifndef _ALIGN #define _ALIGN(p) (((unsigned)(p) + _ALIGNBYTES) & ~_ALIGNBYTES) #endif #ifndef _MACHINE #define _MACHINE i386 #endif #ifndef _MACHINE_ARCH #define _MACHINE_ARCH i386 #endif #ifndef _NO_NAMESPACE_POLLUTION #ifndef _MACHINE_PARAM_H_ #define _MACHINE_PARAM_H_ #ifndef MACHINE #define MACHINE "i386" #endif #ifndef MACHINE_ARCH #define MACHINE_ARCH "i386" #endif #define MID_MACHINE MID_I386 #ifdef SMP #define MAXCPU 16 #else #define MAXCPU 1 #endif /* SMP */ #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) #define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ #define PAGE_SIZE (1<> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) #define i386_btop(x) ((x) >> PAGE_SHIFT) #define i386_ptob(x) ((x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024)) #endif /* !_MACHINE_PARAM_H_ */ #endif /* !_NO_NAMESPACE_POLLUTION */ diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index c041886663ea..eb345bff2d3f 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -1,898 +1,899 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_mprof.h" #include "opt_mutex_wake_all.h" +#include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct thread *)((m)->mtx_lock & MTX_FLAGMASK)) /* * Lock classes for sleep and spin mutexes. 
*/ struct lock_class lock_class_mtx_sleep = { "sleep mutex", LC_SLEEPLOCK | LC_RECURSABLE }; struct lock_class lock_class_mtx_spin = { "spin mutex", LC_SPINLOCK | LC_RECURSABLE }; /* * System-wide mutexes */ struct mtx sched_lock; struct mtx Giant; #ifdef MUTEX_PROFILING SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging"); SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling"); static int mutex_prof_enable = 0; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW, &mutex_prof_enable, 0, "Enable tracing of mutex holdtime"); struct mutex_prof { const char *name; const char *file; int line; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_cur; uintmax_t cnt_contest_holding; uintmax_t cnt_contest_locking; struct mutex_prof *next; }; /* * mprof_buf is a static pool of profiling records to avoid possible * reentrance of the memory allocation functions. * * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE. */ #ifdef MPROF_BUFFERS #define NUM_MPROF_BUFFERS MPROF_BUFFERS #else #define NUM_MPROF_BUFFERS 1000 #endif static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS]; static int first_free_mprof_buf; #ifndef MPROF_HASH_SIZE #define MPROF_HASH_SIZE 1009 #endif #if NUM_MPROF_BUFFERS >= MPROF_HASH_SIZE #error MPROF_BUFFERS must be larger than MPROF_HASH_SIZE #endif static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE]; /* SWAG: sbuf size = avg stat. 
line size * number of locks */ #define MPROF_SBUF_SIZE 256 * 400 static int mutex_prof_acquisitions; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD, &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded"); static int mutex_prof_records; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD, &mutex_prof_records, 0, "Number of profiling records"); static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD, &mutex_prof_maxrecords, 0, "Maximum number of profiling records"); static int mutex_prof_rejected; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD, &mutex_prof_rejected, 0, "Number of rejected profiling records"); static int mutex_prof_hashsize = MPROF_HASH_SIZE; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD, &mutex_prof_hashsize, 0, "Hash size"); static int mutex_prof_collisions = 0; SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD, &mutex_prof_collisions, 0, "Number of hash collisions"); /* * mprof_mtx protects the profiling buffers and the hash. */ static struct mtx mprof_mtx; MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET); static u_int64_t nanoseconds(void) { struct timespec tv; nanotime(&tv); return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec); } static int dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, i; static int multiplier = 1; if (first_free_mprof_buf == 0) return (SYSCTL_OUT(req, "No locking recorded", sizeof("No locking recorded"))); retry_sbufops: sb = sbuf_new(NULL, NULL, MPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN); sbuf_printf(sb, "%6s %12s %11s %5s %12s %12s %s\n", "max", "total", "count", "avg", "cnt_hold", "cnt_lock", "name"); /* * XXX this spinlock seems to be by far the largest perpetrator * of spinlock latency (1.6 msec on an Athlon1600 was recorded * even before I pessimized it further by moving the average * computation here). 
*/ mtx_lock_spin(&mprof_mtx); for (i = 0; i < first_free_mprof_buf; ++i) { sbuf_printf(sb, "%6ju %12ju %11ju %5ju %12ju %12ju %s:%d (%s)\n", mprof_buf[i].cnt_max / 1000, mprof_buf[i].cnt_tot / 1000, mprof_buf[i].cnt_cur, mprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 : mprof_buf[i].cnt_tot / (mprof_buf[i].cnt_cur * 1000), mprof_buf[i].cnt_contest_holding, mprof_buf[i].cnt_contest_locking, mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name); if (sbuf_overflowed(sb)) { mtx_unlock_spin(&mprof_mtx); sbuf_delete(sb); multiplier++; goto retry_sbufops; } } mtx_unlock_spin(&mprof_mtx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics"); static int reset_mutex_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; if (first_free_mprof_buf == 0) return (0); v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); mtx_lock_spin(&mprof_mtx); bzero(mprof_buf, sizeof(*mprof_buf) * first_free_mprof_buf); bzero(mprof_hash, sizeof(struct mtx *) * MPROF_HASH_SIZE); first_free_mprof_buf = 0; mtx_unlock_spin(&mprof_mtx); return (0); } SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_mutex_prof_stats, "I", "Reset mutex profiling statistics"); #endif /* * Function versions of the inlined __mtx_* macros. These are used by * modules and can also be called from assembly language if needed. 
*/
/*
 * Acquire sleep mutex `m' on behalf of curthread.  `file' and `line'
 * identify the caller and are passed to the assertion, witness and
 * logging hooks.  With MUTEX_PROFILING they are also stored in the mutex
 * as the acquisition point.
 */
void
_mtx_lock_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	/* This entry point is for sleep mutexes only. */
	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
	    ("mtx_lock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
	    file, line));
	WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line);
	_get_sleep_lock(m, curthread, opts, file, line);
	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
#ifdef MUTEX_PROFILING
	/* don't reset the timer when/if recursing */
	if (m->mtx_acqtime == 0) {
		m->mtx_filename = file;
		m->mtx_lineno = line;
		/* Stays 0 (i.e. "not timed") while profiling is disabled. */
		m->mtx_acqtime = mutex_prof_enable ? nanoseconds() : 0;
		++mutex_prof_acquisitions;
	}
#endif
}

/*
 * Release sleep mutex `m'.  With MUTEX_PROFILING, and if an acquisition
 * time was stored, the hold time is first added to the profiling record
 * for the acquiring file:line.  The mutex itself is released at the end
 * via _rel_sleep_lock().
 */
void
_mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
	    file, line));
	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	mtx_assert(m, MA_OWNED);
#ifdef MUTEX_PROFILING
	if (m->mtx_acqtime != 0) {
		static const char *unknown = "(unknown)";
		struct mutex_prof *mpp;
		u_int64_t acqtime, now;
		const char *p, *q;
		volatile u_int hash;

		now = nanoseconds();
		acqtime = m->mtx_acqtime;
		/* Clear the stamp so the next acquisition is timed afresh. */
		m->mtx_acqtime = 0;
		/* Nothing to record if the clock did not advance. */
		if (now <= acqtime)
			goto out;
		/* Skip any leading "../" components of the file name. */
		for (p = m->mtx_filename;
		    p != NULL && strncmp(p, "../", 3) == 0; p += 3)
			/* nothing */ ;
		if (p == NULL || *p == '\0')
			p = unknown;
		/* Hash the line number and file name into a bucket index. */
		for (hash = m->mtx_lineno, q = p; *q != '\0'; ++q)
			hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
		mtx_lock_spin(&mprof_mtx);
		/* Look for an existing record for this file:line. */
		for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
			if (mpp->line == m->mtx_lineno &&
			    strcmp(mpp->file, p) == 0)
				break;
		if (mpp == NULL) {
			/* Just exit if we cannot get a trace buffer */
			if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
				++mutex_prof_rejected;
				goto unlock;
			}
			/* Take the next pool entry and push it on the bucket. */
			mpp = &mprof_buf[first_free_mprof_buf++];
			mpp->name = mtx_name(m);
			mpp->file = p;
			mpp->line = m->mtx_lineno;
			mpp->next = mprof_hash[hash];
			if (mprof_hash[hash] != NULL)
				++mutex_prof_collisions;
			mprof_hash[hash] = mpp;
			++mutex_prof_records;
		}
		/*
		 * Record if the mutex has been held longer now than ever
		 * before.
		 */
		if (now - acqtime > mpp->cnt_max)
			mpp->cnt_max = now - acqtime;
		mpp->cnt_tot += now - acqtime;
		mpp->cnt_cur++;
		/*
		 * There's a small race, really we should cmpxchg
		 * 0 with the current value, but that would bill
		 * the contention to the wrong lock instance if
		 * it followed this also.
		 */
		mpp->cnt_contest_holding += m->mtx_contest_holding;
		m->mtx_contest_holding = 0;
		mpp->cnt_contest_locking += m->mtx_contest_locking;
		m->mtx_contest_locking = 0;
unlock:
		mtx_unlock_spin(&mprof_mtx);
	}
out:
#endif
	_rel_sleep_lock(m, curthread, opts, file, line);
}

/*
 * Acquire spin mutex `m' on behalf of curthread, with the same assertion,
 * witness and logging hooks as the sleep-mutex version.
 */
void
_mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	/* This entry point is for spin mutexes only. */
	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
	    m->mtx_object.lo_name, file, line));
	WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line);
	/* The trailing "|| 1" makes this condition always true. */
#if defined(SMP) || LOCK_DEBUG > 0 || 1
	_get_spin_lock(m, curthread, opts, file, line);
#else
	critical_enter();
#endif
	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
}

/*
 * Release spin mutex `m'; the counterpart of _mtx_lock_spin_flags().
 */
void
_mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line)
{

	MPASS(curthread != NULL);
	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
	    m->mtx_object.lo_name, file, line));
	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
	    line);
	mtx_assert(m, MA_OWNED);
	/* The trailing "|| 1" makes this condition always true. */
#if defined(SMP) || LOCK_DEBUG > 0 || 1
	_rel_spin_lock(m);
#else
	critical_exit();
#endif
}

/*
 * The important part of
mtx_trylock{,_flags}()
 * Tries to acquire lock `m.'  If this function is called on a mutex that
 * is already owned, it will recursively acquire the lock.
 */
int
_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
{
	int rval;

	MPASS(curthread != NULL);

	/* Recurse only when we already own it AND the lock allows recursion. */
	if (mtx_owned(m) && (m->mtx_object.lo_flags & LO_RECURSABLE) != 0) {
		m->mtx_recurse++;
		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
		rval = 1;
	} else
		/* Single attempt; rval is non-zero on success. */
		rval = _obtain_lock(m, curthread);

	LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
	/* Witness is only told about the lock when it was actually taken. */
	if (rval)
		WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);

	return (rval);
}

/*
 * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
 *
 * We call this if the lock is either contested (i.e. we need to go to
 * sleep waiting for it), or if we need to recurse on it.
 */
void
_mtx_lock_sleep(struct mtx *m, struct thread *td, int opts, const char *file,
    int line)
{
	struct turnstile *ts;
#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
	struct thread *owner;
#endif
	uintptr_t v;
#ifdef KTR
	int cont_logged = 0;
#endif
#ifdef MUTEX_PROFILING
	int contested;
#endif

	/* Recursion case: bump the count, flag the lock word, and return. */
	if (mtx_owned(m)) {
		KASSERT((m->mtx_object.lo_flags & LO_RECURSABLE) != 0,
	    ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
		    m->mtx_object.lo_name, file, line));
		m->mtx_recurse++;
		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
		return;
	}

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR4(KTR_LOCK,
		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
		    m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);

#ifdef MUTEX_PROFILING
	contested = 0;
#endif
	/* Each pass retries the acquisition; the loop ends once we own it. */
	while (!_obtain_lock(m, td)) {
#ifdef MUTEX_PROFILING
		contested = 1;
		atomic_add_int(&m->mtx_contest_holding, 1);
#endif
		ts = turnstile_lookup(&m->mtx_object);
		/* Snapshot of the lock word that the checks below work from. */
		v = m->mtx_lock;

		/*
		 * Check if the lock has been released while spinning for
		 * the turnstile chain lock.
		 */
		if (v == MTX_UNOWNED) {
			turnstile_release(&m->mtx_object);
			cpu_spinwait();
			continue;
		}

#ifdef MUTEX_WAKE_ALL
		MPASS(v != MTX_CONTESTED);
#else
		/*
		 * The mutex was marked contested on release. This means that
		 * there are other threads blocked on it.  Grab ownership of
		 * it and propagate its priority to the current thread if
		 * necessary.
		 */
		if (v == MTX_CONTESTED) {
			MPASS(ts != NULL);
			m->mtx_lock = (uintptr_t)td | MTX_CONTESTED;
			turnstile_claim(ts);
			break;
		}
#endif

		/*
		 * If the mutex isn't already contested and a failure occurs
		 * setting the contested bit, the mutex was either released
		 * or the state of the MTX_RECURSED bit changed.
		 */
		if ((v & MTX_CONTESTED) == 0 &&
		    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
			(void *)(v | MTX_CONTESTED))) {
			turnstile_release(&m->mtx_object);
			cpu_spinwait();
			continue;
		}

#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
		/*
		 * If the current owner of the lock is executing on another
		 * CPU, spin instead of blocking.
		 */
		owner = (struct thread *)(v & MTX_FLAGMASK);
		/*
		 * Both preprocessor arms open the same block; without
		 * ADAPTIVE_GIANT the Giant mutex is excluded from spinning.
		 */
#ifdef ADAPTIVE_GIANT
		if (TD_IS_RUNNING(owner)) {
#else
		if (m != &Giant && TD_IS_RUNNING(owner)) {
#endif
			turnstile_release(&m->mtx_object);
			/* Spin until the owner changes or stops running. */
			while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) {
				cpu_spinwait();
			}
			continue;
		}
#endif	/* SMP && !NO_ADAPTIVE_MUTEXES */

		/*
		 * We definitely must sleep for this lock.
		 */
		mtx_assert(m, MA_NOTOWNED);

#ifdef KTR
		/* Log the start of contention once per call, not per pass. */
		if (!cont_logged) {
			CTR6(KTR_CONTENTION,
			    "contention: %p at %s:%d wants %s, taken by %s:%d",
			    td, file, line, m->mtx_object.lo_name,
			    WITNESS_FILE(&m->mtx_object),
			    WITNESS_LINE(&m->mtx_object));
			cont_logged = 1;
		}
#endif

		/*
		 * Block on the turnstile.
		 */
		turnstile_wait(ts, &m->mtx_object, mtx_owner(m));
	}

#ifdef KTR
	if (cont_logged) {
		CTR4(KTR_CONTENTION,
		    "contention end: %s acquired by %p at %s:%d",
		    m->mtx_object.lo_name, td, file, line);
	}
#endif
#ifdef MUTEX_PROFILING
	/* Only the increment is conditional; the reset below always runs. */
	if (contested)
		m->mtx_contest_locking++;
	m->mtx_contest_holding = 0;
#endif
	return;
}

/*
 * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
*
 * This is only called if we need to actually spin for the lock. Recursion
 * is handled inline.
 *
 * The loop counter `i' is shared across all retries: the first 10000000
 * iterations spin with cpu_spinwait() only, later ones add DELAY(1), and
 * once 60000000 is reached the function panics unless kdb is active.
 */
void
_mtx_lock_spin(struct mtx *m, struct thread *td, int opts, const char *file,
    int line)
{
	int i = 0;

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);

	for (;;) {
		if (_obtain_lock(m, td))
			break;

		/* Give interrupts a chance while we spin. */
		critical_exit();
		/* Wait for the lock word to read unowned before retrying. */
		while (m->mtx_lock != MTX_UNOWNED) {
			if (i++ < 10000000) {
				cpu_spinwait();
				continue;
			}
			if (i < 60000000)
				DELAY(1);
			else if (!kdb_active) {
				printf("spin lock %s held by %p for > 5 seconds\n",
				    m->mtx_object.lo_name, (void *)m->mtx_lock);
#ifdef WITNESS
				witness_display_spinlock(&m->mtx_object,
				    mtx_owner(m));
#endif
				panic("spin lock held too long");
			}
			cpu_spinwait();
		}
		/* Re-enter the critical section dropped above before retrying. */
		critical_enter();
	}

	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);

	return;
}

/*
 * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
 *
 * We are only called here if the lock is recursed or contested (i.e. we
 * need to wake up a blocked thread).
*/
void
_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
{
	struct turnstile *ts;
#ifndef PREEMPTION
	struct thread *td, *td1;
#endif

	/* Recursed: drop one level and clear the flag when it reaches zero. */
	if (mtx_recursed(m)) {
		if (--(m->mtx_recurse) == 0)
			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
		return;
	}

	ts = turnstile_lookup(&m->mtx_object);
	if (LOCK_LOG_TEST(&m->mtx_object, opts))
		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);

#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
	/* No turnstile was found: nobody is blocked, just release the lock. */
	if (ts == NULL) {
		_release_lock_quick(m);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
		turnstile_release(&m->mtx_object);
		return;
	}
#else
	MPASS(ts != NULL);
#endif

#ifndef PREEMPTION
	/* XXX */
	/* Remember the thread at the head of the turnstile for the check below. */
	td1 = turnstile_head(ts);
#endif

#ifdef MUTEX_WAKE_ALL
	turnstile_broadcast(ts);
	_release_lock_quick(m);
#else
	/*
	 * A non-zero return from turnstile_signal() leads to a full release;
	 * otherwise the lock word is left as MTX_CONTESTED.
	 */
	if (turnstile_signal(ts)) {
		_release_lock_quick(m);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
	} else {
		m->mtx_lock = MTX_CONTESTED;
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p still contested",
			    m);
	}
#endif
	turnstile_unpend(ts);

#ifndef PREEMPTION
	/*
	 * XXX: This is just a hack until preemption is done.  However,
	 * once preemption is done we need to either wrap the
	 * turnstile_signal() and release of the actual lock in an
	 * extra critical section or change the preemption code to
	 * always just set a flag and never do instant-preempts.
	 */
	td = curthread;
	/*
	 * Do not switch when inside a critical section or when td1's
	 * priority value is not numerically lower than ours.
	 */
	if (td->td_critnest > 0 || td1->td_priority >= td->td_priority)
		return;
	mtx_lock_spin(&sched_lock);
	if (!TD_IS_RUNNING(td1)) {
#ifdef notyet
		if (td->td_ithd != NULL) {
			struct ithd *it = td->td_ithd;

			if (it->it_interrupted) {
				if (LOCK_LOG_TEST(&m->mtx_object, opts))
					CTR2(KTR_LOCK,
				    "_mtx_unlock_sleep: %p interrupted %p",
					    it, it->it_interrupted);
				intr_thd_fixup(it);
			}
		}
#endif
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR2(KTR_LOCK,
			    "_mtx_unlock_sleep: %p switching out lock=%p", m,
			    (void *)m->mtx_lock);

		mi_switch(SW_INVOL, NULL);
		if (LOCK_LOG_TEST(&m->mtx_object, opts))
			CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
			    m, (void *)m->mtx_lock);
	}
	mtx_unlock_spin(&sched_lock);
#endif

	return;
}

/*
 * All the unlocking of MTX_SPIN locks is done inline.
 * See the _rel_spin_lock() macro for the details.
 */

/*
 * The backing function for the INVARIANTS-enabled mtx_assert()
 *
 * `what' selects the condition to check (MA_OWNED, optionally combined
 * with MA_RECURSED or MA_NOTRECURSED, or MA_NOTOWNED); a violated
 * condition or an unknown `what' value panics with `file' and `line'.
 * Nothing is checked once panicstr is set.
 */
#ifdef INVARIANT_SUPPORT
void
_mtx_assert(struct mtx *m, int what, const char *file, int line)
{

	if (panicstr != NULL)
		return;
	switch (what) {
	case MA_OWNED:
	case MA_OWNED | MA_RECURSED:
	case MA_OWNED | MA_NOTRECURSED:
		if (!mtx_owned(m))
			panic("mutex %s not owned at %s:%d",
			    m->mtx_object.lo_name, file, line);
		/* Ownership holds; now check the requested recursion state. */
		if (mtx_recursed(m)) {
			if ((what & MA_NOTRECURSED) != 0)
				panic("mutex %s recursed at %s:%d",
				    m->mtx_object.lo_name, file, line);
		} else if ((what & MA_RECURSED) != 0) {
			panic("mutex %s unrecursed at %s:%d",
			    m->mtx_object.lo_name, file, line);
		}
		break;
	case MA_NOTOWNED:
		if (mtx_owned(m))
			panic("mutex %s owned at %s:%d",
			    m->mtx_object.lo_name, file, line);
		break;
	default:
		panic("unknown mtx_assert at %s:%d", file, line);
	}
}
#endif

/*
 * The MUTEX_DEBUG-enabled mtx_validate()
 *
 * Most of these checks have been moved off into the LO_INITIALIZED flag
 * maintained by the witness code.
*/ #ifdef MUTEX_DEBUG void mtx_validate(struct mtx *); void mtx_validate(struct mtx *m) { /* * XXX: When kernacc() does not require Giant we can reenable this check */ #ifdef notyet /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. */ #ifndef __alpha__ /* * Can't call kernacc() from early init386(), especially when * initializing Giant mutex, because some stuff in kernacc() * requires Giant itself. */ if (!cold) if (!kernacc((caddr_t)m, sizeof(m), VM_PROT_READ | VM_PROT_WRITE)) panic("Can't read and write to mutex %p", m); #endif #endif } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void mtx_init(struct mtx *m, const char *name, const char *type, int opts) { struct lock_object *lock; MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK)) == 0); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ mtx_validate(m); #endif lock = &m->mtx_object; KASSERT((lock->lo_flags & LO_INITIALIZED) == 0, ("mutex \"%s\" %p already initialized", name, m)); bzero(m, sizeof(*m)); if (opts & MTX_SPIN) lock->lo_class = &lock_class_mtx_spin; else lock->lo_class = &lock_class_mtx_sleep; lock->lo_name = name; lock->lo_type = type != NULL ? type : name; if (opts & MTX_QUIET) lock->lo_flags = LO_QUIET; if (opts & MTX_RECURSE) lock->lo_flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) lock->lo_flags |= LO_WITNESS; if (opts & MTX_DUPOK) lock->lo_flags |= LO_DUPOK; m->mtx_lock = MTX_UNOWNED; LOCK_LOG_INIT(lock, opts); WITNESS_INIT(lock); } /* * Remove lock `m' from all_mtx queue. 
We don't allow MTX_QUIET to be
 * passed in as a flag here because if the corresponding mtx_init() was
 * called with MTX_QUIET set, then it will already be set in the mutex's
 * flags.
 */
void
mtx_destroy(struct mtx *m)
{

	LOCK_LOG_DESTROY(&m->mtx_object, 0);

	if (!mtx_owned(m))
		MPASS(mtx_unowned(m));
	else {
		/* Destroying a held mutex: it must be neither recursed nor contested. */
		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);

		/* Tell witness this isn't locked to make it happy. */
		WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
		    __LINE__);
	}

	WITNESS_DESTROY(&m->mtx_object);
}

/*
 * Initialize the mutex code and system mutexes.  This is called from the MD
 * startup code prior to mi_startup().  The per-CPU data space needs to be
 * setup before this is called.
 */
void
mutex_init(void)
{

	/* Setup thread0 so that mutexes work. */
	LIST_INIT(&thread0.td_contested);

	/* Setup turnstiles so that sleep mutexes work. */
	init_turnstiles();

	/*
	 * Initialize mutexes.
	 */
	mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
	/* Giant is left held when this function returns. */
	mtx_lock(&Giant);
}
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
index a720becb7e13..25e81c4e21fe 100644
--- a/sys/kern/kern_shutdown.c
+++ b/sys/kern/kern_shutdown.c
@@ -1,618 +1,619 @@
/*-
 * Copyright (c) 1986, 1988, 1991, 1993
 * The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_mac.h" #include "opt_panic.h" #include "opt_show_busybufs.h" +#include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* smp_active */ #include #include #include #include #include #include #include #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. 
*/
#include

#ifdef KDB
/* Default for debug.debugger_on_panic: off only when KDB_UNATTENDED is set. */
#ifdef KDB_UNATTENDED
int debugger_on_panic = 0;
#else
int debugger_on_panic = 1;
#endif
SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW,
	&debugger_on_panic, 0, "Run debugger on kernel panic");

/* Default for debug.trace_on_panic: on only when KDB_TRACE is set. */
#ifdef KDB_TRACE
int trace_on_panic = 1;
#else
int trace_on_panic = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RW,
	&trace_on_panic, 0, "Print stack trace on kernel panic");
#endif /* KDB */

/* When zero, panic() adds RB_NOSYNC to the boot options. */
int sync_on_panic = 0;
SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW,
	&sync_on_panic, 0, "Do a sync before rebooting from a panic");

SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");

/*
 * Variable panicstr contains argument to first call to panic; used as flag
 * to indicate that the kernel has already called panic.
 */
const char *panicstr;

int dumping;				/* system is dumping */
static struct dumperinfo dumper;	/* our selected dumper */

/* Context information for dump-debuggers. */
static struct pcb dumppcb;		/* Registers. */
static lwpid_t dumptid;			/* Thread ID.
*/ static void boot(int) __dead2; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL) /* * The system call that results in a reboot * * MPSAFE */ /* ARGSUSED */ int reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_check_system_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = suser(td); if (error == 0) { mtx_lock(&Giant); boot(uap->opt); mtx_unlock(&Giant); } return (error); } /* * Called by events that want to shut down.. 
e.g. on a PC
 */
static int shutdown_howto = 0;

void
shutdown_nice(int howto)
{

	/* Remembered here and merged into the flags later by boot(). */
	shutdown_howto = howto;

	/* Send a signal to init(8) and have it shutdown the world */
	if (initproc != NULL) {
		PROC_LOCK(initproc);
		psignal(initproc, SIGINT);
		PROC_UNLOCK(initproc);
	} else {
		/* No init(8) running, so simply reboot */
		boot(RB_NOSYNC);
	}
	return;
}

/* Negative until boot() has started the final filesystem sync. */
static int waittime = -1;

/*
 * Print the uptime reported by getnanouptime() as "Uptime: [Nd][Nh][Nm]Ns".
 * Once a larger unit has been printed (f != 0) every smaller unit is
 * printed as well, even when it is zero.
 */
static void
print_uptime(void)
{
	int f;
	struct timespec ts;

	getnanouptime(&ts);
	printf("Uptime: ");
	f = 0;
	if (ts.tv_sec >= 86400) {
		printf("%ldd", (long)ts.tv_sec / 86400);
		ts.tv_sec %= 86400;
		f = 1;
	}
	if (f || ts.tv_sec >= 3600) {
		printf("%ldh", (long)ts.tv_sec / 3600);
		ts.tv_sec %= 3600;
		f = 1;
	}
	if (f || ts.tv_sec >= 60) {
		printf("%ldm", (long)ts.tv_sec / 60);
		ts.tv_sec %= 60;
		f = 1;
	}
	printf("%lds\n", (long)ts.tv_sec);
}

/*
 * Save the current context and thread id, mark the system as dumping and
 * hand over to dumpsys() with the registered dumper.
 */
static void
doadump(void)
{

	/*
	 * Sometimes people have to call this from the kernel debugger.
	 * (if 'panic' can not dump)
	 * Give them a clue as to why they can't dump.
	 */
	if (dumper.dumper == NULL) {
		printf("Cannot dump. No dump device defined.\n");
		return;
	}

	savectx(&dumppcb);
	dumptid = curthread->td_tid;
	dumping++;
	dumpsys(&dumper);
}

/*
 * Go through the rigmarole of shutting down..
 * this used to be in machdep.c but I'll be damned if I could see
 * anything machine dependent in it.
 */
static void
boot(int howto)
{
	static int first_buf_printf = 1;

	/* collect extra flags that shutdown_nice might have set */
	howto |= shutdown_howto;

	/* We are out of the debugger now. */
	kdb_active = 0;

#ifdef SMP
	if (smp_active)
		printf("boot() called on cpu#%d\n", PCPU_GET(cpuid));
#endif
	/*
	 * Do any callouts that should be done BEFORE syncing the filesystems.
	 */
	EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);

	/*
	 * Now sync filesystems
	 */
	if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
		register struct buf *bp;
		int iter, nbusy, pbusy;
#ifndef PREEMPTION
		int subiter;
#endif

		waittime = 0;

		sync(&thread0, NULL);

		/*
		 * With soft updates, some buffers that are
		 * written will be remarked as dirty until other
		 * buffers are written.
		 */
		for (iter = pbusy = 0; iter < 20; iter++) {
			/* Count buffers that are still referenced or dirty. */
			nbusy = 0;
			for (bp = &buf[nbuf]; --bp >= buf; ) {
				if ((bp->b_flags & B_INVAL) == 0 &&
				    BUF_REFCNT(bp) > 0) {
					nbusy++;
				} else if ((bp->b_flags & (B_DELWRI | B_INVAL))
						== B_DELWRI) {
					/* bawrite(bp);*/
					nbusy++;
				}
			}
			if (nbusy == 0) {
				if (first_buf_printf)
					printf("No buffers busy after final sync");
				break;
			}
			if (first_buf_printf) {
				printf("Syncing disks, buffers remaining... ");
				first_buf_printf = 0;
			}
			printf("%d ", nbusy);
			/* Progress was made: restart the retry budget. */
			if (nbusy < pbusy)
				iter = 0;
			pbusy = nbusy;
			sync(&thread0, NULL);

#ifdef PREEMPTION
			/*
			 * Drop Giant and spin for a while to allow
			 * interrupt threads to run.
			 */
			DROP_GIANT();
			DELAY(50000 * iter);
			PICKUP_GIANT();
#else
			/*
			 * Drop Giant and context switch several times to
			 * allow interrupt threads to run.
			 */
			DROP_GIANT();
			for (subiter = 0; subiter < 50 * iter; subiter++) {
				mtx_lock_spin(&sched_lock);
				mi_switch(SW_VOL, NULL);
				mtx_unlock_spin(&sched_lock);
				DELAY(1000);
			}
			PICKUP_GIANT();
#endif
		}
		printf("\n");
		/*
		 * Count only busy local buffers to prevent forcing
		 * a fsck if we're just a client of a wedged NFS server
		 */
		nbusy = 0;
		for (bp = &buf[nbuf]; --bp >= buf; ) {
			if (((bp->b_flags&B_INVAL) == 0 && BUF_REFCNT(bp)) ||
			    ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI)) {
				/* Buffers without a device are not counted. */
				if (bp->b_dev == NULL) {
					TAILQ_REMOVE(&mountlist,
					    bp->b_vp->v_mount, mnt_list);
					continue;
				}
				nbusy++;
#if defined(SHOW_BUSYBUFS) || defined(DIAGNOSTIC)
				printf(
			    "%d: dev:%s, flags:%0x, blkno:%ld, lblkno:%ld\n",
				    nbusy, devtoname(bp->b_dev),
				    bp->b_flags, (long)bp->b_blkno,
				    (long)bp->b_lblkno);
#endif
			}
		}
		if (nbusy) {
			/*
			 * Failed to sync all blocks. Indicate this and don't
			 * unmount filesystems (thus forcing an fsck on reboot).
			 */
			printf("Giving up on %d buffers\n", nbusy);
			DELAY(5000000);	/* 5 seconds */
		} else {
			if (!first_buf_printf)
				printf("Final sync complete\n");
			/*
			 * Unmount filesystems
			 */
			if (panicstr == 0)
				vfs_unmountall();
		}
		DELAY(100000);		/* wait for console output to finish */
	}

	print_uptime();

	/*
	 * Ok, now do things that assume all filesystem activity has
	 * been completed.
	 */
	EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
	splhigh();
	/* Dump only when RB_DUMP is requested without RB_HALT. */
	if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
		doadump();

	/* Now that we're going to really halt the system... */
	EVENTHANDLER_INVOKE(shutdown_final, howto);

	for(;;) ;	/* safety against shutdown_reset not working */
	/* NOTREACHED */
}

/*
 * If the shutdown was a clean halt, behave accordingly.
 */
static void
shutdown_halt(void *junk, int howto)
{

	if (howto & RB_HALT) {
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		switch (cngetc()) {
		case -1:		/* No console, just die */
			cpu_halt();
			/* NOTREACHED */
		default:
			howto &= ~RB_HALT;
			break;
		}
	}
}

/*
 * Check to see if the system panicked, pause and then reboot
 * according to the specified delay.
 *
 * PANIC_REBOOT_WAIT_TIME of 0 returns immediately, -1 skips the countdown
 * and waits for a key, and any other value counts down in 1/10th second
 * steps; a key press during the countdown aborts the automatic reboot.
 */
static void
shutdown_panic(void *junk, int howto)
{
	int loop;

	if (howto & RB_DUMP) {
		if (PANIC_REBOOT_WAIT_TIME != 0) {
			if (PANIC_REBOOT_WAIT_TIME != -1) {
				printf("Automatic reboot in %d seconds - "
				       "press a key on the console to abort\n",
					PANIC_REBOOT_WAIT_TIME);
				for (loop = PANIC_REBOOT_WAIT_TIME * 10;
				     loop > 0; --loop) {
					DELAY(1000 * 100); /* 1/10th second */
					/* Did user type a key? */
					if (cncheckc() != -1)
						break;
				}
				/* Countdown ran out with no key press. */
				if (!loop)
					return;
			}
		} else { /* zero time specified - reboot NOW */
			return;
		}
		printf("--> Press a key on the console to reboot,\n");
		printf("--> or switch off the system now.\n");
		cngetc();
	}
}

/*
 * Everything done, now reset
 */
static void
shutdown_reset(void *junk, int howto)
{

	printf("Rebooting...\n");
	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */
	/* cpu_boot(howto); */ /* doesn't do anything at the moment */
	cpu_reset();
	/* NOTREACHED */ /* assuming reset worked */
}

#ifdef SMP
static u_int panic_cpu = NOCPU;
#endif

/*
 * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
 * and then reboots.  If we are called twice, then we avoid trying to sync
 * the disks as this often leads to recursive panics.
 *
 * MPSAFE
 */
void
panic(const char *fmt, ...)
{
	struct thread *td = curthread;
	int bootopt, newpanic;
	va_list ap;
	static char buf[256];

#ifdef SMP
	/*
	 * We don't want multiple CPU's to panic at the same time, so we
	 * use panic_cpu as a simple spinlock.  We have to keep checking
	 * panic_cpu if we are spinning in case the panic on the first
	 * CPU is canceled.
	 */
	if (panic_cpu != PCPU_GET(cpuid))
		while (atomic_cmpset_int(&panic_cpu, NOCPU,
		    PCPU_GET(cpuid)) == 0)
			while (panic_cpu != NOCPU)
				; /* nothing */
#endif

	bootopt = RB_AUTOBOOT | RB_DUMP;
	newpanic = 0;
	/* A panic is already in progress: do not try to sync again. */
	if (panicstr)
		bootopt |= RB_NOSYNC;
	else {
		panicstr = fmt;
		newpanic = 1;
	}

	va_start(ap, fmt);
	if (newpanic) {
		/* First panic: keep the formatted message in buf for panicstr. */
		(void)vsnprintf(buf, sizeof(buf), fmt, ap);
		panicstr = buf;
		printf("panic: %s\n", buf);
	} else {
		printf("panic: ");
		vprintf(fmt, ap);
		printf("\n");
	}
	va_end(ap);
#ifdef SMP
	printf("cpuid = %d\n", PCPU_GET(cpuid));
#endif

#ifdef KDB
	if (newpanic && trace_on_panic)
		kdb_backtrace();
	if (debugger_on_panic)
		kdb_enter("panic");
#ifdef RESTARTABLE_PANICS
	/* See if the user aborted the panic, in which case we continue. */
	if (panicstr == NULL) {
#ifdef SMP
		atomic_store_rel_int(&panic_cpu, NOCPU);
#endif
		return;
	}
#endif
#endif
	/* Flag the panicking thread under sched_lock. */
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_INPANIC;
	mtx_unlock_spin(&sched_lock);
	if (!sync_on_panic)
		bootopt |= RB_NOSYNC;
	boot(bootopt);
}

/*
 * Support for poweroff delay.
 */
#ifndef POWEROFF_DELAY
# define POWEROFF_DELAY 5000
#endif
static int poweroff_delay = POWEROFF_DELAY;

SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
	&poweroff_delay, 0, "");

/*
 * Delay for poweroff_delay * 1000 DELAY() units, but only when RB_POWEROFF
 * is requested and the configured delay is positive.
 */
static void
poweroff_wait(void *junk, int howto)
{

	if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
		return;
	DELAY(poweroff_delay * 1000);
}

/*
 * Some system processes (e.g. syncer) need to be stopped at appropriate
 * points in their main loops prior to a system shutdown, so that they
 * won't interfere with the shutdown process (e.g. by holding a disk buf
 * to cause sync to fail).  For each of these system processes, register
 * shutdown_kproc() as a handler for one of shutdown events.
 */
static int kproc_shutdown_wait = 60;
SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
    &kproc_shutdown_wait, 0, "");

/*
 * Suspend the system process passed in `arg' (a struct proc *), waiting
 * at most kproc_shutdown_wait seconds.  Nothing is done during a panic.
 */
void
kproc_shutdown(void *arg, int howto)
{
	struct proc *p;
	char procname[MAXCOMLEN + 1];
	int error;

	if (panicstr)
		return;

	p = (struct proc *)arg;
	/* Copy the name before the suspend call below. */
	strlcpy(procname, p->p_comm, sizeof(procname));
	printf("Waiting (max %d seconds) for system process `%s' to stop...",
	    kproc_shutdown_wait, procname);
	error = kthread_suspend(p, kproc_shutdown_wait * hz);

	if (error == EWOULDBLOCK)
		printf("timed out\n");
	else
		printf("done\n");
}

/*
 * Registration of dumpers.  Passing NULL clears the current dumper; a
 * second registration while one is set fails with EBUSY.
 */
int
set_dumper(struct dumperinfo *di)
{

	if (di == NULL) {
		bzero(&dumper, sizeof dumper);
		return (0);
	}
	if (dumper.dumper != NULL)
		return (EBUSY);
	dumper = *di;
	return (0);
}

#if defined(__powerpc__)
/* Stub: only prints that kernel dumps are not implemented here. */
void
dumpsys(struct dumperinfo *di __unused)
{

	printf("Kernel dumps not implemented on this architecture\n");
}
#endif
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index e36128f702c9..69cfb5f5bae1 100644
---
a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -1,873 +1,877 @@ /* * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*** Here is the logic.. If there are N processors, then there are at most N KSEs (kernel schedulable entities) working to process threads that belong to a KSEGROUP (kg). If there are X of these KSEs actually running at the moment in question, then there are at most M (N-X) of these KSEs on the run queue, as running KSEs are not on the queue. Runnable threads are queued off the KSEGROUP in priority order. If there are M or more threads runnable, the top M threads (by priority) are 'preassigned' to the M KSEs not running. 
The KSEs take their priority from those threads and are put on the run queue. The last thread that had a priority high enough to have a KSE associated with it, AND IS ON THE RUN QUEUE is pointed to by kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs assigned as all the available KSEs are actively running, or because there are no threads queued, that pointer is NULL. When a KSE is removed from the run queue to become runnable, we know it was associated with the highest priority thread in the queue (at the head of the queue). If it is also the last assigned we know M was 1 and must now be 0. Since the thread is no longer queued that pointer must be removed from it. Since we know there were no more KSEs available, (M was 1 and is now 0) and since we are not FREEING our KSE but using it, we know there are STILL no more KSEs available, we can prove that the next thread in the ksegrp list will not have a KSE to assign to it, so we can show that the pointer must be made 'invalid' (NULL). The pointer exists so that when a new thread is made runnable, it can have its priority compared with the last assigned thread to see if it should 'steal' its KSE or not.. i.e. is it 'earlier' on the list than that thread or later.. If it's earlier, then the KSE is removed from the last assigned (which is now not assigned a KSE) and reassigned to the new thread, which is placed earlier in the list. The pointer is then backed up to the previous thread (which may or may not be the new thread). When a thread sleeps or is removed, the KSE becomes available and if there are queued threads that are not assigned KSEs, the highest priority one of them is assigned the KSE, which is then placed back on the run queue at the appropriate place, and the kg->kg_last_assigned pointer is adjusted down to point to it. The following diagram shows 2 KSEs and 3 threads from a single process. RUNQ: --->KSE---KSE--...
(KSEs queued at priorities from threads) \ \____ \ \ KSEGROUP---thread--thread--thread (queued in priority order) \ / \_______________/ (last_assigned) The result of this scheme is that the M available KSEs are always queued at the priorities they have inherrited from the M highest priority threads for that KSEGROUP. If this situation changes, the KSEs are reassigned to keep this true. ***/ #include __FBSDID("$FreeBSD$"); -#include "opt_full_preemption.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && (defined(__i386__) || defined(__amd64__)) #include #endif #include #if defined(SMP) && defined(SCHED_4BSD) #include #endif +#ifdef FULL_PREEMPTION +#ifndef PREEMPTION +#error "The FULL_PREEMPTION option requires the PREEMPTION option" +#endif +#endif CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); void panc(char *string1, char *string2); #if 0 static void runq_readjust(struct runq *rq, struct kse *ke); #endif /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ /* * Select the KSE that will be run next. From that find the thread, and * remove it from the KSEGRP's run queue. If there is thread clustering, * this will be what does it. 
*/
struct thread *
choosethread(void)
{
	struct kse *ke;
	struct thread *td;
	struct ksegrp *grp;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
		/* SMP is inactive and this is not CPU 0: run the idle thread. */
		td = PCPU_GET(idlethread);
		ke = td->td_kse;
		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		ke->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(td);
		return (td);
	}
#endif

	/* Keep picking until we find a thread that is allowed to run. */
	for (;;) {
		ke = sched_choose();
		if (ke == NULL) {
			/* Nothing queued: fall back to this CPU's idle thread. */
			td = PCPU_GET(idlethread);
			ke = td->td_kse;
			CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
		} else {
			td = ke->ke_thread;
			KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
			grp = ke->ke_ksegrp;
			if ((td->td_proc->p_flag & P_SA) != 0) {
				/*
				 * Threaded process: back up last_assigned if it
				 * pointed at this thread, then unlink the thread
				 * from the ksegrp run queue.
				 */
				if (grp->kg_last_assigned == td)
					grp->kg_last_assigned = TAILQ_PREV(td,
					    threadqueue, td_runq);
				TAILQ_REMOVE(&grp->kg_runq, td, td_runq);
				grp->kg_runnable--;
			}
			CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
			    td, td->td_priority);
		}
		ke->ke_flags |= KEF_DIDRUN;

		/*
		 * Outside of a panic any thread will do.  During a panic only
		 * system threads and the panicking thread itself may run.
		 */
		if (panicstr == NULL ||
		    (td->td_proc->p_flag & P_SYSTEM) != 0 ||
		    (td->td_flags & TDF_INPANIC) != 0)
			break;

		/* Rejected: note that it is no longer on the run queue. */
		TD_SET_CAN_RUN(td);
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Given a surplus KSE, either assign a new runnable thread to it
 * (and put it in the run queue) or put it in the ksegrp's idle KSE list.
 * Assumes that the original thread is not runnable.
*/ void kse_reassign(struct kse *ke) { struct ksegrp *kg; struct thread *td; struct thread *original; mtx_assert(&sched_lock, MA_OWNED); original = ke->ke_thread; KASSERT(original == NULL || TD_IS_INHIBITED(original), ("reassigning KSE with runnable thread")); kg = ke->ke_ksegrp; if (original) original->td_kse = NULL; /* * Find the first unassigned thread */ if ((td = kg->kg_last_assigned) != NULL) td = TAILQ_NEXT(td, td_runq); else td = TAILQ_FIRST(&kg->kg_runq); /* * If we found one, assign it the kse, otherwise idle the kse. */ if (td) { kg->kg_last_assigned = td; td->td_kse = ke; ke->ke_thread = td; CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td); sched_add(td, SRQ_BORING); return; } ke->ke_state = KES_IDLE; ke->ke_thread = NULL; TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses++; CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke); return; } #if 0 /* * Remove a thread from its KSEGRP's run queue. * This in turn may remove it from a KSE if it was already assigned * to one, possibly causing a new thread to be assigned to the KSE * and the KSE getting a new priority. */ static void remrunqueue(struct thread *td) { struct thread *td2, *td3; struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue")); kg = td->td_ksegrp; ke = td->td_kse; CTR1(KTR_RUNQ, "remrunqueue: td%p", td); TD_SET_CAN_RUN(td); /* * If it is not a threaded process, take the shortcut. */ if ((td->td_proc->p_flag & P_SA) == 0) { /* Bring its kse with it, leave the thread attached */ sched_rem(td); ke->ke_state = KES_THREAD; return; } td3 = TAILQ_PREV(td, threadqueue, td_runq); TAILQ_REMOVE(&kg->kg_runq, td, td_runq); kg->kg_runnable--; if (ke) { /* * This thread has been assigned to a KSE. * We need to dissociate it and try assign the * KSE to the next available thread. Then, we should * see if we need to move the KSE in the run queues. 
*/
		sched_rem(td);
		ke->ke_state = KES_THREAD;
		td2 = kg->kg_last_assigned;
		KASSERT((td2 != NULL), ("last assigned has wrong value"));
		if (td2 == td)
			kg->kg_last_assigned = td3;
		kse_reassign(ke);
	}
}
#endif

/*
 * Change the priority of a thread that is on the run queue.
 *
 * For a non-P_SA process the thread is only re-queued (sched_rem followed
 * by sched_add) when the new priority maps to a different run-queue index.
 * For a P_SA process the thread is pulled off the ksegrp run queue and
 * re-inserted through setrunqueue() with the new priority.
 * Must be called with sched_lock held.
 */
void
adjustrunqueue( struct thread *td, int newpri)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));

	ke = td->td_kse;
	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
	/*
	 * If it is not a threaded process, take the shortcut.
	 */
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/* We only care about the kse in the run queue. */
		td->td_priority = newpri;
		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
			sched_rem(td);
			sched_add(td, SRQ_BORING);
		}
		return;
	}

	/* It is a threaded process */
	kg = td->td_ksegrp;
	TD_SET_CAN_RUN(td);
	if (ke) {
		/* Keep kg_last_assigned valid before td leaves the queue. */
		if (kg->kg_last_assigned == td) {
			kg->kg_last_assigned =
			    TAILQ_PREV(td, threadqueue, td_runq);
		}
		sched_rem(td);
	}
	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
	kg->kg_runnable--;
	td->td_priority = newpri;
	setrunqueue(td, SRQ_BORING);
}

/*
 * Put a thread on the run queue.
 *
 * Non-P_SA processes go straight to sched_add().  For P_SA processes the
 * thread is inserted into the ksegrp run queue in priority order and a KSE
 * (an idle one, one taken from the last assigned thread, or the thread's
 * own) is attached to the appropriate thread, which is then passed to
 * sched_add().  If no KSE is available the thread just waits on the
 * ksegrp queue.  Must be called with sched_lock held.
 */
void
setrunqueue(struct thread *td, int flags)
{
	struct kse *ke;
	struct ksegrp *kg;
	struct thread *td2;
	struct thread *tda;	/* last thread in the group holding a KSE */
	int count;

	CTR4(KTR_RUNQ, "setrunqueue: td:%p ke:%p kg:%p pid:%d",
	    td, td->td_kse, td->td_ksegrp, td->td_proc->p_pid);
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("setrunqueue: bad thread state"));
	TD_SET_RUNQ(td);
	kg = td->td_ksegrp;
	if ((td->td_proc->p_flag & P_SA) == 0) {
		/*
		 * Common path optimisation: Only one of everything
		 * and the KSE is always already attached.
		 * Totally ignore the ksegrp run queue.
		 */
		sched_add(td, flags);
		return;
	}

	tda = kg->kg_last_assigned;
	if ((ke = td->td_kse) == NULL) {
		if (kg->kg_idle_kses) {
			/*
			 * There is a free one so it's ours for the asking..
			 */
			ke = TAILQ_FIRST(&kg->kg_iq);
			CTR2(KTR_RUNQ, "setrunqueue: kg:%p: Use free ke:%p",
			    kg, ke);
			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
			ke->ke_state = KES_THREAD;
			kg->kg_idle_kses--;
		} else if (tda && (tda->td_priority > td->td_priority)) {
			/*
			 * None free, but there is one we can commandeer.
			 */
			ke = tda->td_kse;
			CTR3(KTR_RUNQ,
			    "setrunqueue: kg:%p: take ke:%p from td: %p",
			    kg, ke, tda);
			sched_rem(tda);
			tda->td_kse = NULL;
			ke->ke_thread = NULL;
			tda = kg->kg_last_assigned =
			    TAILQ_PREV(tda, threadqueue, td_runq);
		}
	} else {
		/*
		 * Temporarily disassociate so it looks like the other cases.
		 */
		ke->ke_thread = NULL;
		td->td_kse = NULL;
	}

	/*
	 * Add the thread to the ksegrp's run queue at
	 * the appropriate place.
	 */
	count = 0;
	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
		if (td2->td_priority > td->td_priority) {
			kg->kg_runnable++;
			TAILQ_INSERT_BEFORE(td2, td, td_runq);
			break;
		}
		/* XXX Debugging hack */
		if (++count > 10000) {
			printf("setrunqueue(): corrupt kq_runq, td= %p\n", td);
			panic("deadlock in setrunqueue");
		}
	}
	if (td2 == NULL) {
		/* We ran off the end of the TAILQ or it was empty. */
		kg->kg_runnable++;
		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
	}

	/*
	 * If we have a ke to use, then put it on the run queue and,
	 * if needed, readjust the last_assigned pointer.
	 */
	if (ke) {
		if (tda == NULL) {
			/*
			 * No pre-existing last assigned so whoever is first
			 * gets the KSE we brought in.. (maybe us)
			 */
			td2 = TAILQ_FIRST(&kg->kg_runq);
			KASSERT((td2->td_kse == NULL),
			    ("unexpected ke present"));
			td2->td_kse = ke;
			ke->ke_thread = td2;
			kg->kg_last_assigned = td2;
		} else if (tda->td_priority > td->td_priority) {
			/*
			 * It's ours, grab it, but last_assigned is past us
			 * so don't change it.
			 */
			td->td_kse = ke;
			ke->ke_thread = td;
		} else {
			/*
			 * We are past last_assigned, so
			 * put the new kse on whatever is next,
			 * which may or may not be us.
			 */
			td2 = TAILQ_NEXT(tda, td_runq);
			kg->kg_last_assigned = td2;
			td2->td_kse = ke;
			ke->ke_thread = td2;
		}
		sched_add(ke->ke_thread, flags);
	} else {
		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
		    td, td->td_ksegrp, td->td_proc->p_pid);
	}
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */

/*
 * Enter a critical section.  The machine-dependent hook is invoked only on
 * the outermost entry (nesting count 0 -> 1).
 */
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	if (td->td_critnest == 0)
		cpu_critical_enter(td);
	td->td_critnest++;
}

/*
 * Leave a critical section.  On the outermost exit, and when PREEMPTION is
 * compiled in, a preemption that was deferred (TDP_OWEPREEMPT) is carried
 * out via mi_switch() before the nesting count drops to zero.
 */
void
critical_exit(void)
{
	struct thread *td;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));
	if (td->td_critnest == 1) {
#ifdef PREEMPTION
		mtx_assert(&sched_lock, MA_NOTOWNED);
		if (td->td_pflags & TDP_OWEPREEMPT) {
			mtx_lock_spin(&sched_lock);
			mi_switch(SW_INVOL, NULL);
			mtx_unlock_spin(&sched_lock);
		}
#endif
		td->td_critnest = 0;
		cpu_critical_exit(td);
	} else {
		td->td_critnest--;
	}
}

/*
 * This function is called when a thread is about to be put on run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines if the new thread should be immediately preempted to.  If so,
 * it switches to it and eventually returns true.  If not, it returns false
 * so that the caller may place the thread on an appropriate run queue.
 * Without PREEMPTION compiled in it always returns false.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;
#endif

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - It is too early in the boot for context switches (cold is set).
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	/* Bail out if curthread and its KSE do not point at each other. */
	if (ctd->td_kse == NULL || ctd->td_kse->ke_thread != ctd)
		return (0);
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
	    td->td_kse->ke_state != KES_THREAD)
		return (0);
#ifndef FULL_PREEMPTION
	/*
	 * The test below uses the PRI_MIN_ITHD..PRI_MAX_ITHD range for the
	 * new thread and PRI_MIN_IDLE as the floor for the current thread.
	 */
	if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
	    !(cpri >= PRI_MIN_IDLE))
		return (0);
#endif
	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_pflags |= TDP_OWEPREEMPT;
		return (0);
	}

	/*
	 * Our thread state says that we are already on a run queue, so
	 * update our state as if we had been dequeued by choosethread().
	 */
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL, td);
	return (1);
#else
	return (0);
#endif
}

#if 0
#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{
	printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning)
#endif
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
*/ void runq_init(struct runq *rq) { int i; bzero(rq, sizeof *rq); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. */ static __inline void runq_clrbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* * Find the index of the first non-empty run queue. This is done by * scanning the status bits, a set bit indicates a non-empty queue. */ static __inline int runq_findbit(struct runq *rq) { struct rqbits *rqb; int pri; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", rqb->rqb_bits[i], i, pri); return (pri); } return (-1); } /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. */ static __inline void runq_setbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); } /* * Add the KSE to the queue specified by its priority, and set the * corresponding status bit. 
*/ void runq_add(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; pri = ke->ke_thread->td_priority / RQ_PPQ; ke->ke_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p", ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh); TAILQ_INSERT_TAIL(rqh, ke, ke_procq); } /* * Return true if there are runnable processes of any priority on the run * queue, false otherwise. Has no side effects, does not modify the run * queue structure. */ int runq_check(struct runq *rq) { struct rqbits *rqb; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", rqb->rqb_bits[i], i); return (1); } CTR0(KTR_RUNQ, "runq_check: empty"); return (0); } #if defined(SMP) && defined(SCHED_4BSD) int runq_fuzz = 1; SYSCTL_DECL(_kern_sched); SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); #endif /* * Find the highest priority process on the run queue. */ struct kse * runq_choose(struct runq *rq) { struct rqhead *rqh; struct kse *ke; int pri; mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; #if defined(SMP) && defined(SCHED_4BSD) /* fuzz == 1 is normal.. 0 or less are ignored */ if (runq_fuzz > 1) { /* * In the first couple of entries, check if * there is one for our CPU as a preference. 
*/ int count = runq_fuzz; int cpu = PCPU_GET(cpuid); struct kse *ke2; ke2 = ke = TAILQ_FIRST(rqh); while (count-- && ke2) { if (ke->ke_thread->td_lastcpu == cpu) { ke = ke2; break; } ke2 = TAILQ_NEXT(ke2, ke_procq); } } else #endif ke = TAILQ_FIRST(rqh); KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); return (ke); } CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); return (NULL); } /* * Remove the KSE from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. * Caller must set ke->ke_state afterwards. */ void runq_remove(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); pri = ke->ke_rqindex; rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p", ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh); KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); TAILQ_REMOVE(rqh, ke, ke_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove: empty"); runq_clrbit(rq, pri); } } #if 0 void panc(char *string1, char *string2) { printf("%s", string1); kdb_enter(string2); } void thread_sanity_check(struct thread *td, char *string) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct thread *td2 = NULL; unsigned int prevpri; int saw_lastassigned = 0; int unassigned = 0; int assigned = 0; p = td->td_proc; kg = td->td_ksegrp; ke = td->td_kse; if (ke) { if (p != ke->ke_proc) { panc(string, "wrong proc"); } if (ke->ke_thread != td) { panc(string, "wrong thread"); } } if ((p->p_flag & P_SA) == 0) { if (ke == NULL) { panc(string, "non KSE thread lost kse"); } } else { prevpri = 0; saw_lastassigned = 0; unassigned = 0; assigned = 0; TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { if (td2->td_priority < prevpri) { panc(string, "thread runqueue unosorted"); } if ((td2->td_state == TDS_RUNQ) && td2->td_kse && 
(td2->td_kse->ke_state != KES_ONRUNQ)) { panc(string, "KSE wrong state"); } prevpri = td2->td_priority; if (td2->td_kse) { assigned++; if (unassigned) { panc(string, "unassigned before assigned"); } if (kg->kg_last_assigned == NULL) { panc(string, "lastassigned corrupt"); } if (saw_lastassigned) { panc(string, "last assigned not last"); } if (td2->td_kse->ke_thread != td2) { panc(string, "mismatched kse/thread"); } } else { unassigned++; } if (td2 == kg->kg_last_assigned) { saw_lastassigned = 1; if (td2->td_kse == NULL) { panc(string, "last assigned not assigned"); } } } if (kg->kg_last_assigned && (saw_lastassigned == 0)) { panc(string, "where on earth does lastassigned point?"); } #if 0 FOREACH_THREAD_IN_GROUP(kg, td2) { if (((td2->td_flags & TDF_UNBOUND) == 0) && (TD_ON_RUNQ(td2))) { assigned++; if (td2->td_kse == NULL) { panc(string, "BOUND thread with no KSE"); } } } #endif #if 0 if ((unassigned + assigned) != kg->kg_runnable) { panc(string, "wrong number in runnable"); } #endif } if (assigned == 12345) { printf("%p %p %p %p %p %d, %d", td, td2, ke, kg, p, assigned, saw_lastassigned); } } #endif diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 5582a40f0d90..e235c1f9b752 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,1837 +1,1851 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include __FBSDID("$FreeBSD$");

+#include
+
 #include #include #include #include #include #include #include #include
 #include #include #include #include #include #include #include #include
 #ifdef KTRACE
 #include #include
 #endif

 #include #include

#define KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
    "Scheduler name");

/*
 * Time-slice bounds, in ticks.  Both are writable through sysctl and are
 * read by the SCHED_SLICE_* macros.
 */
static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

+#ifdef PREEMPTION
+static void
+printf_caddr_t(void *data)
+{
+	printf("%s", (char *)data);
+}
+static char preempt_warning[] =
+    "WARNING: Kernel PREEMPTION is unstable under SCHED_ULE.\n";
+SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
+    preempt_warning)
+#endif
+
/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */
struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
/* Shorthand so scheduler fields can be reached directly from a struct kse. */
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
/* The run-queue link doubles as the "next" pointer of a per-CPU assigned list. */
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */
#define	KEF_XFERABLE	KEF_SCHED2	/* KSE was added as transferable. */
#define	KEF_HOLD	KEF_SCHED3	/* KSE is temporarily bound. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

/* Statically allocated scheduler state, exported through the *0_sched pointers below. */
struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
*/
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
/* Half of the nice range; used as the offset that maps nice values to array slots. */
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
/* Scale an interactivity score (0..SCHED_INTERACT_MAX) onto the priority range. */
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
*/
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
/*
 * True when the KSE's thread priority is better (numerically lower) than
 * the group's user priority, or when the group counts as interactive.
 */
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int			ksq_transferable;
	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
	struct kseq_group	*ksq_group;	/* Our processor group. */
	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a numa environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static cpumask_t kseq_idle;	/* groups whose cpus are all idle; see kseq_idled() */
static int ksg_maxid;		/* highest kseq_groups[] index in use */
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
static int bal_tick;		/* next tick for sched_balance() */
static int gbal_tick;		/* next tick for sched_balance_groups() */

#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;

/* Uniprocessor: there is a single queue, so the cpu argument is ignored. */
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_add_internal(struct thread *td, int preemptive);
static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle); /* * On P4 Xeons the round-robin interrupt delivery is broken. As a result of * this, we can't pin interrupts to the cpu that they were delivered to, * otherwise all ithreads only run on CPU 0. */ #ifdef __i386__ #define KSE_CAN_MIGRATE(ke, class) \ ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) #else /* !__i386__ */ #define KSE_CAN_MIGRATE(ke, class) \ ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ ((ke)->ke_flags & KEF_BOUND) == 0) #endif /* !__i386__ */ #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); #ifdef SMP printf("\tload transferable: %d\n", kseq->ksq_transferable); #endif printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < SCHED_PRI_NRESV; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke) { #ifdef SMP if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { kseq->ksq_transferable++; kseq->ksq_group->ksg_transferable++; ke->ke_flags |= KEF_XFERABLE; } #endif runq_add(ke->ke_runq, ke); } static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke) { #ifdef SMP if (ke->ke_flags & KEF_XFERABLE) { kseq->ksq_transferable--; kseq->ksq_group->ksg_transferable--; ke->ke_flags &= ~KEF_XFERABLE; } #endif runq_remove(ke->ke_runq, ke); } static void kseq_load_add(struct kseq *kseq, struct kse *ke) { int class; mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); if (class == PRI_TIMESHARE) kseq->ksq_load_timeshare++; kseq->ksq_load++; if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP kseq->ksq_group->ksg_load++; #else kseq->ksq_sysload++; #endif if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 
CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_proc->p_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_proc->p_nice); } static void kseq_load_rem(struct kseq *kseq, struct kse *ke) { int class; mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); if (class == PRI_TIMESHARE) kseq->ksq_load_timeshare--; if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP kseq->ksq_group->ksg_load--; #else kseq->ksq_sysload--; #endif kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_proc->p_nice); } static void kseq_nice_add(struct kseq *kseq, int nice) { mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_load_timeshare == 0) return; for (; n < SCHED_PRI_NRESV; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * sched_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. 
The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ static void sched_balance(void) { struct kseq_group *high; struct kseq_group *low; struct kseq_group *ksg; int cnt; int i; if (smp_started == 0) goto out; low = high = NULL; i = random() % (ksg_maxid + 1); for (cnt = 0; cnt <= ksg_maxid; cnt++) { ksg = KSEQ_GROUP(i); /* * Find the CPU with the highest load that has some * threads to transfer. */ if ((high == NULL || ksg->ksg_load > high->ksg_load) && ksg->ksg_transferable) high = ksg; if (low == NULL || ksg->ksg_load < low->ksg_load) low = ksg; if (++i > ksg_maxid) i = 0; } if (low != NULL && high != NULL && high != low) sched_balance_pair(LIST_FIRST(&high->ksg_members), LIST_FIRST(&low->ksg_members)); out: bal_tick = ticks + (random() % (hz * 2)); } static void sched_balance_groups(void) { int i; mtx_assert(&sched_lock, MA_OWNED); if (smp_started) for (i = 0; i <= ksg_maxid; i++) sched_balance_group(KSEQ_GROUP(i)); gbal_tick = ticks + (random() % (hz * 2)); } static void sched_balance_group(struct kseq_group *ksg) { struct kseq *kseq; struct kseq *high; struct kseq *low; int load; if (ksg->ksg_transferable == 0) return; low = NULL; high = NULL; LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { load = kseq->ksq_load; if (high == NULL || load > high->ksq_load) high = kseq; if (low == NULL || load < low->ksq_load) low = kseq; } if (high != NULL && low != NULL && high != low) sched_balance_pair(high, low); } static void sched_balance_pair(struct kseq *high, struct kseq *low) { int transferable; int high_load; int low_load; int move; int diff; int i; /* * If we're transfering within a group we have to use this 
specific * kseq's transferable count, otherwise we can steal from other members * of the group. */ if (high->ksq_group == low->ksq_group) { transferable = high->ksq_transferable; high_load = high->ksq_load; low_load = low->ksq_load; } else { transferable = high->ksq_group->ksg_transferable; high_load = high->ksq_group->ksg_load; low_load = low->ksq_group->ksg_load; } if (transferable == 0) return; /* * Determine what the imbalance is and then adjust that to how many * kses we actually have to give up (transferable). */ diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; move = min(move, transferable); for (i = 0; i < move; i++) kseq_move(high, KSEQ_ID(low)); return; } static void kseq_move(struct kseq *from, int cpu) { struct kseq *kseq; struct kseq *to; struct kse *ke; kseq = from; to = KSEQ_CPU(cpu); ke = kseq_steal(kseq, 1); if (ke == NULL) { struct kseq_group *ksg; ksg = kseq->ksq_group; LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { if (kseq == from || kseq->ksq_transferable == 0) continue; ke = kseq_steal(kseq, 1); break; } if (ke == NULL) panic("kseq_move: No KSEs available with a " "transferable count of %d\n", ksg->ksg_transferable); } if (kseq == to) return; ke->ke_state = KES_THREAD; kseq_runq_rem(kseq, ke); kseq_load_rem(kseq, ke); kseq_notify(ke, cpu); } static int kseq_idled(struct kseq *kseq) { struct kseq_group *ksg; struct kseq *steal; struct kse *ke; ksg = kseq->ksq_group; /* * If we're in a cpu group, try and steal kses from another cpu in * the group before idling. */ if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { if (steal == kseq || steal->ksq_transferable == 0) continue; ke = kseq_steal(steal, 0); if (ke == NULL) continue; ke->ke_state = KES_THREAD; kseq_runq_rem(steal, ke); kseq_load_rem(steal, ke); ke->ke_cpu = PCPU_GET(cpuid); sched_add_internal(ke->ke_thread, 0); return (0); } } /* * We only set the idled bit when all of the cpus in the group are * idle. 
Otherwise we could get into a situation where a KSE bounces * back and forth between two idle cores on seperate physical CPUs. */ ksg->ksg_idlemask |= PCPU_GET(cpumask); if (ksg->ksg_idlemask != ksg->ksg_cpumask) return (1); atomic_set_int(&kseq_idle, ksg->ksg_mask); return (1); } static void kseq_assign(struct kseq *kseq) { struct kse *nke; struct kse *ke; do { *(volatile struct kse **)&ke = kseq->ksq_assigned; } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); for (; ke != NULL; ke = nke) { nke = ke->ke_assign; ke->ke_flags &= ~KEF_ASSIGNED; sched_add_internal(ke->ke_thread, 0); } } static void kseq_notify(struct kse *ke, int cpu) { struct kseq *kseq; struct thread *td; struct pcpu *pcpu; int prio; ke->ke_cpu = cpu; ke->ke_flags |= KEF_ASSIGNED; prio = ke->ke_thread->td_priority; kseq = KSEQ_CPU(cpu); /* * Place a KSE on another cpu's queue and force a resched. */ do { *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); /* * Without sched_lock we could lose a race where we set NEEDRESCHED * on a thread that is switched out before the IPI is delivered. This * would lead us to miss the resched. This will be a problem once * sched_lock is pushed down. 
*/
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

/*
 * Walk the status bitmap of a run queue and return the first kse, on a
 * queue whose status bit is set, for which KSE_CAN_MIGRATE() is true.
 * Returns NULL when no such kse exists.  The kse is only located here; it
 * is not removed from its queue.  sched_lock must be held.
 */
static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		/* Skip status words that have no bits set. */
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			/* Queue index is the bit position across all words. */
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}

/*
 * Find a kse that may be stolen from kseq.  The queues are tried in the
 * order ksq_next, ksq_curr and, only when stealidle is non-zero, ksq_idle.
 * Returns NULL if none of them yields a candidate.
 */
static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

/*
 * Try to hand ke over to an idle cpu.  Returns 1 if the kse was passed to
 * kseq_notify() for another cpu (ke_runq is cleared first) and 0 if it
 * should stay where it is.  Always returns 0 while smp_started is 0.  The
 * class argument is not referenced in this function.
 */
int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	/*
	 * If our load exceeds a certain threshold we should attempt to
	 * reassign this thread.  The first candidate is the cpu that
	 * originally ran the thread.  If it is idle, assign it there,
	 * otherwise, pick an idle cpu.
	 *
	 * The threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	ksg = kseq->ksq_group;
	if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
		ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
		if (kseq_idle & ksg->ksg_mask) {
			cpu = ffs(ksg->ksg_idlemask);
			if (cpu)
				goto migrate;
		}
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			goto migrate;
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	ksg = kseq->ksq_group;
	if (ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			goto migrate;
	}
	/*
	 * No new CPU was found.
	 */
	return (0);
migrate:
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 * ffs() results are 1-based (0 means no bit was set), so convert
	 * the result back to a cpu number first.
	 */
	cpu--;
	ke->ke_runq = NULL;
	kseq_notify(ke, cpu);

	return (1);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 *
 * ksq_curr is tried first.  When it is empty ksq_curr and ksq_next are
 * swapped, at most once per call, and the search is retried.  If both are
 * empty the result of runq_choose() on ksq_idle is returned.  sched_lock
 * must be held.
 */
static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 *
		 * Recompute its slice and move it to ksq_next before
		 * looking for another candidate.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

/*
 * Initialize the run queues of a kseq and zero its load counters.
 * ksq_curr and ksq_next start out as timeshare queues 0 and 1.
 */
static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

/*
 * Scheduler initialization.  Derives slice_min and slice_max from hz, sets
 * up the kseq(s) and, on SMP, the kseq groups and balancer ticks, then
 * accounts kse0 in the load of the current cpu's kseq.  The dummy argument
 * is unused.
 */
static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		/* No topology information: one group per cpu slot. */
		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kseq group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					/*
					 * ksg_mask takes the bit of the first
					 * (lowest numbered) member found.
					 */
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			/* Group balancing is only armed for multi-cpu groups. */
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	/* Only TIMESHARE ksegrps get a computed user priority. */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	/* Clamp to [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE]. */
	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 *
 * The result is stored in ke->ke_slice; nothing is returned.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get a minimal slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		/* Distance of this process' nice from ksq_nicemin. */
		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	/* Nothing to trim while the history is below the limit. */
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	/* Otherwise scale both values down to 4/5ths. */
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

/*
 * Limit the history carried by a ksegrp at fork time: when runtime plus
 * slptime exceeds SCHED_SLP_RUN_FORK both are divided by the integer ratio
 * sum / SCHED_SLP_RUN_FORK.
 */
static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		/* sum > SCHED_SLP_RUN_FORK here, so ratio is at least 1. */
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

/*
 * Compute the interactivity score of a ksegrp from kg_runtime and
 * kg_slptime.
 *
 * When run time dominates the score is SCHED_INTERACT_HALF plus
 * (SCHED_INTERACT_HALF minus the scaled sleep time); when sleep time
 * dominates it is the scaled run time.  Equal values score 0.
 */
static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		/* max(1, ...) keeps the divisor non-zero. */
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

/*
 * Bring a kse's tick accounting up to date.  If its last tick falls inside
 * the most recent SCHED_CPU_TICKS window, ke_ticks is rescaled from the
 * span (ticks - ke_ftick) to SCHED_CPU_TICKS; otherwise it is zeroed.  The
 * window is then reset to end at the current value of ticks.
 */
static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

/*
 * Change the priority of a thread.  A thread that is on a run queue is
 * re-queued through adjustrunqueue(); for any other thread td_priority is
 * simply overwritten.  sched_lock must be held.
 */
void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		/*
		 * Hold this kse on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
		 *
		 * NOTE(review): ke is dereferenced unconditionally here
		 * although the test above allows for ke == NULL.
		 */
		ke->ke_flags |= KEF_HOLD;
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

/*
 * Switch away from td.  If newtd is NULL the next thread is obtained from
 * choosethread(); otherwise newtd's kse is added to this cpu's load and
 * newtd is run.  sched_lock must be held.
 */
void
sched_switch(struct thread *td, struct thread *newtd)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (td == PCPU_GET(idlethread)) {
			TD_SET_CAN_RUN(td);
		} else if (TD_IS_RUNNING(td)) {
			kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			/*
			 * Don't allow the kse to migrate from a preemption.
			 */
			ke->ke_flags |= KEF_HOLD;
			setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
		} else {
			if (ke->ke_runq) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			} else if ((td->td_flags & TDF_IDLETD) == 0)
				kdb_backtrace();
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	if (newtd != NULL)
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
	else
		newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	/*
	 * NOTE(review): presumably re-records td as the owner of sched_lock
	 * once td is running again -- confirm against the mutex code.
	 */
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

/*
 * Set the nice value of a process.  The per-kseq nice accounting is moved
 * from the old value to the new one for every queued TIMESHARE kse, then
 * the priority of each ksegrp is recomputed and all of its threads are
 * flagged for rescheduling.  Both the proc lock and sched_lock must be
 * held.
 */
void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		if (kg->kg_pri_class == PRI_TIMESHARE) {
			FOREACH_KSE_IN_GROUP(kg, ke) {
				if (ke->ke_runq == NULL)
					continue;
				kseq = KSEQ_CPU(ke->ke_cpu);
				kseq_nice_rem(kseq, p->p_nice);
				kseq_nice_add(kseq, nice);
			}
		}
	}
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		sched_priority(kg);
		FOREACH_THREAD_IN_GROUP(kg, td)
			td->td_flags |= TDF_NEEDRESCHED;
	}
}

/*
 * Record that td is going to sleep: remember the current tick in
 * td_slptime and copy td_priority into td_base_pri.  sched_lock must be
 * held.
 */
void
sched_sleep(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_base_pri = td->td_priority;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

/*
 * Make td runnable again.  If a sleep start time was recorded the slept
 * time is credited to the ksegrp and its priority (and the kse's slice, if
 * the thread has a kse) are recomputed before the thread is queued.
 * sched_lock must be held.
 */
void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		/* Same << 10 scaling as used for kg_runtime elsewhere. */
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			/* A very long sleep replaces the history outright. */
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td, SRQ_BORING);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 *
 * The child process p1 inherits the nice value of td's process; the
 * per-ksegrp, per-kse and per-thread hooks are then run on p1's first
 * ksegrp, kse and thread.  sched_lock must be held.
 */
void
sched_fork(struct thread *td, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	p1->p_nice = td->td_proc->p_nice;
	sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}

/*
 * Initialize the scheduler state of a child kse from td's kse: a slice of
 * 1, the parent's cpu, no run queue, and the parent's tick accounting.
 */
void
sched_fork_kse(struct thread *td, struct kse *child)
{

	struct kse *ke = td->td_kse;

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

/*
 * Copy td's ksegrp history (slptime, runtime, user priority) into the
 * child ksegrp, trim the child's copy with sched_interact_fork(), and
 * charge the parent one tick increment of run time.  The child's proc lock
 * must be held.
 */
void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	struct ksegrp *kg = td->td_ksegrp;
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

/*
 * Per-thread fork hook.  Intentionally empty: nothing is done here.
 */
void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

/*
 * Change the priority class of a ksegrp.  For every kse of the group in
 * state KES_ONRUNQ or KES_THREAD the per-kseq counters (transferable
 * counts on SMP, timeshare load and nice accounting) are moved from the
 * old class to the new one.  A no-op if the class is unchanged.
 * sched_lock must be held.
 */
void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_proc->p_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
}

/*
 * Remove td's kse from the load of the kseq of the cpu it is accounted
 * on.  The ke argument is not used by this function.
 */
void
sched_exit_kse(struct kse *ke, struct thread *td)
{
	kseq_load_rem(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
}

/*
 * Add the run time of td's ksegrp to kg and re-apply the history limit.
 * Sleep time is deliberately not transferred (see the disabled line).
 */
void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
{
	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
	kg->kg_runtime += td->td_ksegrp->kg_runtime;
	sched_interact_update(kg);
}

/*
 * Per-thread exit hook.  Intentionally empty: nothing is done here.
 */
void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

/*
 * Per-tick scheduler work for the thread td running on this cpu: run the
 * load balancers when their tick is due (SMP), update the kse's tick
 * accounting, and for TIMESHARE ksegrps charge run time and consume the
 * time slice, requeueing the kse when the slice is used up.  sched_lock
 * must be held.
 */
void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
	if (ticks == bal_tick)
		sched_balance();
	if (ticks == gbal_tick)
		sched_balance_groups();
	/*
	 * We could have been assigned a non real-time thread without an
	 * IPI.
	 */
	if (kseq->ksq_assigned)
		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
#endif
	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	/* The idle thread gets no slice or interactivity accounting. */
	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Report whether this cpu has other work to run.  Returns 1 if ksq_load is
 * non-zero while the idle thread is current, or greater than 1 while any
 * other thread is current; returns 0 otherwise.  On SMP, pending assigned
 * kses are pulled in first under sched_lock.
 */
int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

/*
 * Reset td's priority to its ksegrp's user priority if the two differ.
 * sched_lock is only taken when a change is needed.
 */
void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

/*
 * Select the next kse to run on this cpu.  The chosen kse is removed from
 * its run queue and put in state KES_THREAD.  Returns NULL when nothing is
 * runnable.  On SMP, when only an idle-class kse or nothing at all is
 * available, kseq_idled() is consulted and a return of 0 from it restarts
 * the selection.  sched_lock must be held.
 */
struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

/*
 * Public entry point for making a thread runnable.  Translates the flags
 * into the preemptive argument of sched_add_internal(): preemption is
 * disabled when SRQ_YIELDING is set and enabled otherwise.
 */
void
sched_add(struct thread *td, int flags)
{

	/* let jeff work out how to map the flags better */
	/* I'm open to suggestions */
	if (flags & SRQ_YIELDING)
		/*
		 * Preempting during switching can be bad JUJU
		 * especially for KSE processes
		 */
		sched_add_internal(td, 0);
	else
		sched_add_internal(td, 1);
}

/*
 * Put td's kse on a run queue of this cpu, or hand it to another cpu.
 * Nothing is done for a kse that already has KEF_ASSIGNED set.  When
 * preemptive is non-zero maybe_preempt() is given the chance to run the
 * thread right away instead of queueing it.  sched_lock must be held.
 */
static void
sched_add_internal(struct thread *td, int preemptive)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
#ifdef SMP
	int canmigrate;
#endif
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	kseq = KSEQ_SELF();
	KASSERT((ke->ke_thread != NULL),
	    ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	/* Pick the target run queue from the base priority class. */
	class = PRI_BASE(kg->kg_pri_class);
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	/*
	 * Don't migrate running threads here.  Force the long term balancer
	 * to do it.
	 */
	canmigrate = KSE_CAN_MIGRATE(ke, class);
	if (ke->ke_flags & KEF_HOLD) {
		/* KEF_HOLD is one-shot: consume it here. */
		ke->ke_flags &= ~KEF_HOLD;
		canmigrate = 0;
	}
	/*
	 * If this thread is pinned or bound, notify the target cpu.
	 */
	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
		ke->ke_runq = NULL;
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.  If not, see if we should transfer this thread.
	 */
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	} else if (kseq->ksq_load > 1 && canmigrate)
		if (kseq_transfer(kseq, ke, class))
			return;
	ke->ke_cpu = PCPU_GET(cpuid);
#endif
	/*
	 * XXX With preemption this is not necessary.
	 */
	if (td->td_priority < curthread->td_priority &&
	    ke->ke_runq == kseq->ksq_curr)
		curthread->td_flags |= TDF_NEEDRESCHED;
	if (preemptive && maybe_preempt(td))
		return;
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke);
	kseq_load_add(kseq, ke);
}

/*
 * Take td's kse off its run queue: the kse goes to state KES_THREAD, the
 * ksegrp's queued-kse count is decremented and the kseq's run queue and
 * load accounting are updated.  A kse with KEF_ASSIGNED set is left alone.
 */
void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;
	/*
	 * It is safe to just return here because sched_rem() is only ever
	 * used in places where we're immediately going to add the
	 * kse back on again.  In that case it'll be added with the correct
	 * thread and priority when the caller drops the sched_lock.
	 */
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}

/*
 * Return the recent cpu usage of td as a fixed point value.  Returns 0 if
 * the thread has no kse or the kse has accumulated no ticks.  As a side
 * effect ke_ltick - ke_ftick is stored in the process' p_swtime.  Takes
 * and releases sched_lock itself.
 */
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

/*
 * Bind td to the given cpu by setting KEF_BOUND on its kse.  On SMP, if
 * the caller is not already on that cpu, the kse is handed to the target
 * cpu through kseq_notify() and the caller switches away with mi_switch().
 * sched_lock must be held.
 */
void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL, NULL);
#endif
}

/*
 * Undo sched_bind(): clear KEF_BOUND on td's kse.  sched_lock must be
 * held.
 */
void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

/*
 * Return the scheduler load: on SMP the sum of ksg_load over kseq groups
 * 0 through ksg_maxid, otherwise ksq_sysload of this cpu's kseq.
 */
int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= ksg_maxid; i++)
		total += KSEQ_GROUP(i)->ksg_load;
	return (total);
#else
	return (KSEQ_SELF()->ksq_sysload);
#endif
}

/*
 * Size of a struct kse plus the scheduler's struct ke_sched.
 */
int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

/*
 * Size of a struct ksegrp plus the scheduler's struct kg_sched.
 */
int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

/*
 * Size of a struct proc; no scheduler-private data is added.
 */
int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

/*
 * Size of a struct thread plus the scheduler's struct td_sched.
 */
int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c
index 6e50b6b581b1..9d779de097f2 100644
--- a/sys/vm/vm_zeroidle.c
+++ b/sys/vm/vm_zeroidle.c
@@ -1,193 +1,195 @@
/*-
 * Copyright (c) 1994 John Dyson
 * Copyright (c) 2001 Matt Dillon
 *
 * All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * from: FreeBSD: .../i386/vm_machdep.c,v 1.165 2001/07/04 23:27:04 dillon */ #include __FBSDID("$FreeBSD$"); +#include + #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vm_stats_misc); static int cnt_prezero; SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, ""); static int idlezero_enable_default = 1; TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default); /* Defer setting the enable flag until the kthread is running. 
*/ static int idlezero_enable = 0; SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, ""); static int idlezero_maxrun = 16; SYSCTL_INT(_vm, OID_AUTO, idlezero_maxrun, CTLFLAG_RW, &idlezero_maxrun, 0, ""); TUNABLE_INT("vm.idlezero_maxrun", &idlezero_maxrun); /* * Implement the pre-zeroed page mechanism. */ #define ZIDLE_LO(v) ((v) * 2 / 3) #define ZIDLE_HI(v) ((v) * 4 / 5) static int zero_state; static int vm_page_zero_check(void) { if (!idlezero_enable) return (0); /* * Attempt to maintain approximately 1/2 of our free pages in a * PG_ZERO'd state. Add some hysteresis to (attempt to) avoid * generally zeroing a page when the system is near steady-state. * Otherwise we might get 'flutter' during disk I/O / IPC or * fast sleeps. We also do not want to be continuously zeroing * pages because doing so may flush our L1 and L2 caches too much. */ if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count)) return (0); if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) return (0); return (1); } static int vm_page_zero_idle(void) { static int free_rover; vm_page_t m; mtx_lock_spin(&vm_page_queue_free_mtx); zero_state = 0; m = vm_pageq_find(PQ_FREE, free_rover, FALSE); if (m != NULL && (m->flags & PG_ZERO) == 0) { vm_pageq_remove_nowakeup(m); mtx_unlock_spin(&vm_page_queue_free_mtx); pmap_zero_page_idle(m); mtx_lock_spin(&vm_page_queue_free_mtx); m->flags |= PG_ZERO; vm_pageq_enqueue(PQ_FREE + m->pc, m); ++vm_page_zero_count; ++cnt_prezero; if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) zero_state = 1; } free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; mtx_unlock_spin(&vm_page_queue_free_mtx); return (1); } /* Called by vm_page_free to hint that a new page is available. 
*/ void vm_page_zero_idle_wakeup(void) { if (idlezero_enable && vm_page_zero_check()) wakeup(&zero_state); } static void vm_pagezero(void __unused *arg) { struct proc *p; struct rtprio rtp; struct thread *td; int pages, pri; td = curthread; p = td->td_proc; rtp.prio = RTP_PRIO_MAX; rtp.type = RTP_PRIO_IDLE; pages = 0; mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, td->td_ksegrp); pri = td->td_priority; mtx_unlock_spin(&sched_lock); idlezero_enable = idlezero_enable_default; for (;;) { if (vm_page_zero_check()) { pages += vm_page_zero_idle(); #ifndef PREEMPTION if (pages > idlezero_maxrun || sched_runnable()) { mtx_lock_spin(&sched_lock); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); pages = 0; } #endif } else { tsleep(&zero_state, pri, "pgzero", hz * 300); pages = 0; } } } static struct proc *pagezero_proc; static void pagezero_start(void __unused *arg) { int error; error = kthread_create(vm_pagezero, NULL, &pagezero_proc, RFSTOPPED, 0, "pagezero"); if (error) panic("pagezero_start: error %d\n", error); /* * We're an idle task, don't count us in the load. */ PROC_LOCK(pagezero_proc); pagezero_proc->p_flag |= P_NOLOAD; PROC_UNLOCK(pagezero_proc); mtx_lock_spin(&sched_lock); setrunqueue(FIRST_THREAD_IN_PROC(pagezero_proc), SRQ_BORING); mtx_unlock_spin(&sched_lock); } SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)