Index: head/sys/conf/NOTES =================================================================== --- head/sys/conf/NOTES (revision 297747) +++ head/sys/conf/NOTES (revision 297748) @@ -1,3036 +1,3044 @@ # $FreeBSD$ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers', # 'makeoptions', 'hints', etc. go into the kernel configuration that you # run config(8) with. # # Lines that begin with 'hint.' are NOT for config(8), they go into your # hints file. See /boot/device.hints and/or the 'hints' config(8) directive. # # Please use ``make LINT'' to create an old-style LINT file if you want to # do kernel test-builds. # # This file contains machine independent kernel configuration notes. For # machine dependent notes, look in /sys//conf/NOTES. # # # NOTES conventions and style guide: # # Large block comments should begin and end with a line containing only a # comment character. # # To describe a particular object, a block comment (if it exists) should # come first. Next should come device, options, and hints lines in that # order. All device and option lines must be described by a comment that # doesn't just expand the device or option name. Use only a concise # comment on the same line if possible. Very detailed descriptions of # devices and subsystems belong in man pages. # # A space followed by a tab separates 'options' from an option name. Two # spaces followed by a tab separate 'device' from a device name. Comments # after an option or device should use one space after the comment character. # To comment out a negative option that disables code and thus should not be # enabled for LINT builds, precede 'options' with "#!". # # # This is the ``identification'' of the kernel. Usually this should # be the same as the name of your kernel. # ident LINT # # The `maxusers' parameter controls the static sizing of a number of # internal system tables by a formula defined in subr_param.c. # Omitting this parameter or setting it to 0 will cause the system to # auto-size based on physical memory. # maxusers 10 # To statically compile in device wiring instead of /boot/device.hints #hints "LINT.hints" # Default places to look for devices. # Use the following to compile in values accessible to the kernel # through getenv() (or kenv(1) in userland). The format of the file # is 'variable=value', see kenv(1) # #env "LINT.env" # # The `makeoptions' parameter allows variables to be passed to the # generated Makefile in the build area. # # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS} # after most other flags. Here we use it to inhibit use of non-optimal # gcc built-in functions (e.g., memcmp). # # DEBUG happens to be magic. # The following is equivalent to 'config -g KERNELNAME' and creates # 'kernel.debug' compiled with -g debugging as well as a normal # 'kernel'. Use 'make install.debug' to install the debug kernel # but that isn't normally necessary as the debug symbols are not loaded # by the kernel and are not useful there anyway. # # KERNEL can be overridden so that you can change the default name of your # kernel. # # MODULES_OVERRIDE can be used to limit modules built to a specific list. # makeoptions CONF_CFLAGS=-fno-builtin #Don't allow use of memcmp, etc. #makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols #makeoptions KERNEL=foo #Build kernel "foo" and install "/foo" # Only build ext2fs module plus those parts of the sound system I need. #makeoptions MODULES_OVERRIDE="ext2fs sound/sound sound/driver/maestro3" makeoptions DESTDIR=/tmp # # FreeBSD processes are subject to certain limits to their consumption # of system resources. See getrlimit(2) for more details. Each # resource limit has two values, a "soft" limit and a "hard" limit. # The soft limits can be modified during normal system operation, but # the hard limits are set at boot time. Their default values are # in sys//include/vmparam.h. There are two ways to change them: # # 1. Set the values at kernel build time. The options below are one # way to allow that limit to grow to 1GB. They can be increased # further by changing the parameters: # # 2. In /boot/loader.conf, set the tunables kern.maxswzone, # kern.maxbcache, kern.maxtsiz, kern.dfldsiz, kern.maxdsiz, # kern.dflssiz, kern.maxssiz and kern.sgrowsiz. # # The options in /boot/loader.conf override anything in the kernel # configuration file. See the function init_param1 in # sys/kern/subr_param.c for more details. # options MAXDSIZ=(1024UL*1024*1024) options MAXSSIZ=(128UL*1024*1024) options DFLDSIZ=(1024UL*1024*1024) # # BLKDEV_IOSIZE sets the default block size used in user block # device I/O. Note that this value will be overridden by the label # when specifying a block device from a label with a non-0 # partition blocksize. The default is PAGE_SIZE. # options BLKDEV_IOSIZE=8192 # # MAXPHYS and DFLTPHYS # # These are the maximal and safe 'raw' I/O block device access sizes. # Reads and writes will be split into MAXPHYS chunks for known good # devices and DFLTPHYS for the rest. Some applications have better # performance with larger raw I/O access sizes. Note that certain VM # parameters are derived from these values and making them too large # can make an unbootable kernel. # # The defaults are 64K and 128K respectively. options DFLTPHYS=(64*1024) options MAXPHYS=(128*1024) # This allows you to actually store this configuration file into # the kernel binary itself. See config(8) for more details. # options INCLUDE_CONFIG_FILE # Include this file in kernel # # Compile-time defaults for various boot parameters # options BOOTVERBOSE=1 options BOOTHOWTO=RB_MULTIPLE options GEOM_AES # Don't use, use GEOM_BDE options GEOM_BDE # Disk encryption. options GEOM_BSD # BSD disklabels options GEOM_CACHE # Disk cache. options GEOM_CONCAT # Disk concatenation. options GEOM_ELI # Disk encryption. options GEOM_FOX # Redundant path mitigation options GEOM_GATE # Userland services. options GEOM_JOURNAL # Journaling. options GEOM_LABEL # Providers labelization. options GEOM_LINUX_LVM # Linux LVM2 volumes options GEOM_MAP # Map based partitioning options GEOM_MBR # DOS/MBR partitioning options GEOM_MIRROR # Disk mirroring. options GEOM_MULTIPATH # Disk multipath options GEOM_NOP # Test class. options GEOM_PART_APM # Apple partitioning options GEOM_PART_BSD # BSD disklabel options GEOM_PART_BSD64 # BSD disklabel64 options GEOM_PART_EBR # Extended Boot Records options GEOM_PART_EBR_COMPAT # Backward compatible partition names options GEOM_PART_GPT # GPT partitioning options GEOM_PART_LDM # Logical Disk Manager options GEOM_PART_MBR # MBR partitioning options GEOM_PART_PC98 # PC-9800 disk partitioning options GEOM_PART_VTOC8 # SMI VTOC8 disk label options GEOM_PC98 # NEC PC9800 partitioning options GEOM_RAID # Soft RAID functionality. options GEOM_RAID3 # RAID3 functionality. options GEOM_SHSEC # Shared secret. options GEOM_STRIPE # Disk striping. options GEOM_SUNLABEL # Sun/Solaris partitioning options GEOM_UZIP # Read-only compressed disks options GEOM_VINUM # Vinum logical volume manager options GEOM_VIRSTOR # Virtual storage. options GEOM_VOL # Volume names from UFS superblock options GEOM_ZERO # Performance testing helper. # # The root device and filesystem type can be compiled in; # this provides a fallback option if the root device cannot # be correctly guessed by the bootstrap code, or an override if # the RB_DFLTROOT flag (-r) is specified when booting the kernel. # options ROOTDEVNAME=\"ufs:da0s2e\" ##################################################################### # Scheduler options: # # Specifying one of SCHED_4BSD or SCHED_ULE is mandatory. These options # select which scheduler is compiled in. # # SCHED_4BSD is the historical, proven, BSD scheduler. It has a global run # queue and no CPU affinity which makes it suboptimal for SMP. It has very # good interactivity and priority selection. # # SCHED_ULE provides significant performance advantages over 4BSD on many # workloads on SMP machines. It supports cpu-affinity, per-cpu runqueues # and scheduler locks. It also has a stronger notion of interactivity # which leads to better responsiveness even on uniprocessor machines. This # is the default scheduler. # # SCHED_STATS is a debugging option which keeps some stats in the sysctl # tree at 'kern.sched.stats' and is useful for debugging scheduling decisions. # options SCHED_4BSD options SCHED_STATS #options SCHED_ULE ##################################################################### # SMP OPTIONS: # # SMP enables building of a Symmetric MultiProcessor Kernel. # Mandatory: options SMP # Symmetric MultiProcessor Kernel # MAXCPU defines the maximum number of CPUs that can boot in the system. # A default value should be already present, for every architecture. options MAXCPU=32 # MAXMEMDOM defines the maximum number of memory domains that can boot in the # system. A default value should already be defined by every architecture. -options MAXMEMDOM=1 +options MAXMEMDOM=2 + +# VM_NUMA_ALLOC enables use of memory domain-aware allocation in the VM +# system. +options VM_NUMA_ALLOC + +# DEVICE_NUMA enables reporting of domain affinity of I/O devices via +# bus_get_domain(), etc. +options DEVICE_NUMA # ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin # if the thread that currently owns the mutex is executing on another # CPU. This behavior is enabled by default, so this option can be used # to disable it. options NO_ADAPTIVE_MUTEXES # ADAPTIVE_RWLOCKS changes the behavior of reader/writer locks to spin # if the thread that currently owns the rwlock is executing on another # CPU. This behavior is enabled by default, so this option can be used # to disable it. options NO_ADAPTIVE_RWLOCKS # ADAPTIVE_SX changes the behavior of sx locks to spin if the thread that # currently owns the sx lock is executing on another CPU. # This behavior is enabled by default, so this option can be used to # disable it. options NO_ADAPTIVE_SX # MUTEX_NOINLINE forces mutex operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options MUTEX_NOINLINE # RWLOCK_NOINLINE forces rwlock operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options RWLOCK_NOINLINE # SX_NOINLINE forces sx lock operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options SX_NOINLINE # SMP Debugging Options: # # CALLOUT_PROFILING enables rudimentary profiling of the callwheel data # structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. # WARNING! Only tested on amd64 and i386. # FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel # threads. Its sole use is to expose race conditions and other # bugs during development. Enabling this option will reduce # performance and increase the frequency of kernel panics by # design. If you aren't sure that you need it then you don't. # Relies on the PREEMPTION option. DON'T TURN THIS ON. # MUTEX_DEBUG enables various extra assertions in the mutex code. # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table # used to hold active sleep queues as well as sleep wait message # frequency. # TURNSTILE_PROFILING enables rudimentary profiling of the hash table # used to hold active lock queues. # UMTX_PROFILING enables rudimentary profiling of the hash table used to hold active lock queues. # WITNESS enables the witness code which detects deadlocks and cycles # during locking operations. # WITNESS_KDB causes the witness code to drop into the kernel debugger if # a lock hierarchy violation occurs or if locks are held when going to # sleep. # WITNESS_SKIPSPIN disables the witness checks on spin mutexes. options PREEMPTION options FULL_PREEMPTION options MUTEX_DEBUG options WITNESS options WITNESS_KDB options WITNESS_SKIPSPIN # LOCK_PROFILING - Profiling locks. See LOCK_PROFILING(9) for details. options LOCK_PROFILING # Set the number of buffers and the hash size. The hash size MUST be larger # than the number of buffers. Hash size should be prime. options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" # Profiling for the callout(9) backend. options CALLOUT_PROFILING # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING options UMTX_PROFILING ##################################################################### # COMPATIBILITY OPTIONS # # Implement system calls compatible with 4.3BSD and older versions of # FreeBSD. You probably do NOT want to remove this as much current code # still relies on the 4.3 emulation. Note that some architectures that # are supported by FreeBSD do not include support for certain important # aspects of this compatibility option, namely those related to the # signal delivery mechanism. # options COMPAT_43 # Old tty interface. options COMPAT_43TTY # Note that as a general rule, COMPAT_FREEBSD depends on # COMPAT_FREEBSD, COMPAT_FREEBSD, etc. # Enable FreeBSD4 compatibility syscalls options COMPAT_FREEBSD4 # Enable FreeBSD5 compatibility syscalls options COMPAT_FREEBSD5 # Enable FreeBSD6 compatibility syscalls options COMPAT_FREEBSD6 # Enable FreeBSD7 compatibility syscalls options COMPAT_FREEBSD7 # Enable FreeBSD9 compatibility syscalls options COMPAT_FREEBSD9 # Enable FreeBSD10 compatibility syscalls options COMPAT_FREEBSD10 # Enable Linux Kernel Programming Interface options COMPAT_LINUXKPI # # These three options provide support for System V Interface # Definition-style interprocess communication, in the form of shared # memory, semaphores, and message queues, respectively. # options SYSVSHM options SYSVSEM options SYSVMSG ##################################################################### # DEBUGGING OPTIONS # # Compile with kernel debugger related code. # options KDB # # Print a stack trace of the current thread on the console for a panic. # options KDB_TRACE # # Don't enter the debugger for a panic. Intended for unattended operation # where you may want to enter the debugger from the console, but still want # the machine to recover from a panic. # options KDB_UNATTENDED # # Enable the ddb debugger backend. # options DDB # # Print the numerical value of symbols in addition to the symbolic # representation. # options DDB_NUMSYM # # Enable the remote gdb debugger backend. # options GDB # # SYSCTL_DEBUG enables a 'sysctl' debug tree that can be used to dump the # contents of the registered sysctl nodes on the console. It is disabled by # default because it generates excessively verbose console output that can # interfere with serial console operation. # options SYSCTL_DEBUG # # Enable textdump by default, this disables kernel core dumps. # options TEXTDUMP_PREFERRED # # Enable extra debug messages while performing textdumps. # options TEXTDUMP_VERBOSE # # NO_SYSCTL_DESCR omits the sysctl node descriptions to save space in the # resulting kernel. options NO_SYSCTL_DESCR # # MALLOC_DEBUG_MAXZONES enables multiple uma zones for malloc(9) # allocations that are smaller than a page. The purpose is to isolate # different malloc types into hash classes, so that any buffer # overruns or use-after-free will usually only affect memory from # malloc types in that hash class. This is purely a debugging tool; # by varying the hash function and tracking which hash class was # corrupted, the intersection of the hash classes from each instance # will point to a single malloc type that is being misused. At this # point inspection or memguard(9) can be used to catch the offending # code. # options MALLOC_DEBUG_MAXZONES=8 # # DEBUG_MEMGUARD builds and enables memguard(9), a replacement allocator # for the kernel used to detect modify-after-free scenarios. See the # memguard(9) man page for more information on usage. # options DEBUG_MEMGUARD # # DEBUG_REDZONE enables buffer underflows and buffer overflows detection for # malloc(9). # options DEBUG_REDZONE # # EARLY_PRINTF enables support for calling a special printf (eprintf) # very early in the kernel (before cn_init() has been called). This # should only be used for debugging purposes early in boot. Normally, # it is not defined. It is commented out here because this feature # isn't generally available. And the required eputc() isn't defined. # #options EARLY_PRINTF # # KTRACE enables the system-call tracing facility ktrace(2). To be more # SMP-friendly, KTRACE uses a worker thread to process most trace events # asynchronously to the thread generating the event. This requires a # pre-allocated store of objects representing trace events. The # KTRACE_REQUEST_POOL option specifies the initial size of this store. # The size of the pool can be adjusted both at boottime and runtime via # the kern.ktrace_request_pool tunable and sysctl. # options KTRACE #kernel tracing options KTRACE_REQUEST_POOL=101 # # KTR is a kernel tracing facility imported from BSD/OS. It is # enabled with the KTR option. KTR_ENTRIES defines the number of # entries in the circular trace buffer; it may be an arbitrary number. # KTR_BOOT_ENTRIES defines the number of entries during the early boot, # before malloc(9) is functional. # KTR_COMPILE defines the mask of events to compile into the kernel as # defined by the KTR_* constants in . KTR_MASK defines the # initial value of the ktr_mask variable which determines at runtime # what events to trace. KTR_CPUMASK determines which CPU's log # events, with bit X corresponding to CPU X. The layout of the string # passed as KTR_CPUMASK must match a series of bitmasks each of them # separated by the "," character (ie: # KTR_CPUMASK=0xAF,0xFFFFFFFFFFFFFFFF). KTR_VERBOSE enables # dumping of KTR events to the console by default. This functionality # can be toggled via the debug.ktr_verbose sysctl and defaults to off # if KTR_VERBOSE is not defined. See ktr(4) and ktrdump(8) for details. # options KTR options KTR_BOOT_ENTRIES=1024 options KTR_ENTRIES=(128*1024) options KTR_COMPILE=(KTR_ALL) options KTR_MASK=KTR_INTR options KTR_CPUMASK=0x3 options KTR_VERBOSE # # ALQ(9) is a facility for the asynchronous queuing of records from the kernel # to a vnode, and is employed by services such as ktr(4) to produce trace # files based on a kernel event stream. Records are written asynchronously # in a worker thread. # options ALQ options KTR_ALQ # # The INVARIANTS option is used in a number of source files to enable # extra sanity checking of internal structures. This support is not # enabled by default because of the extra time it would take to check # for these conditions, which can only occur as a result of # programming errors. # options INVARIANTS # # The INVARIANT_SUPPORT option makes us compile in support for # verifying some of the internal structures. It is a prerequisite for # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be # called. The intent is that you can set 'INVARIANTS' for single # source files (by changing the source file or specifying it on the # command line) if you have 'INVARIANT_SUPPORT' enabled. Also, if you # wish to build a kernel module with 'INVARIANTS', then adding # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary # infrastructure without the added overhead. # options INVARIANT_SUPPORT # # The DIAGNOSTIC option is used to enable extra debugging information # from some parts of the kernel. As this makes everything more noisy, # it is disabled by default. # options DIAGNOSTIC # # REGRESSION causes optional kernel interfaces necessary only for regression # testing to be enabled. These interfaces may constitute security risks # when enabled, as they permit processes to easily modify aspects of the # run-time environment to reproduce unlikely or unusual (possibly normally # impossible) scenarios. # options REGRESSION # # This option lets some drivers co-exist that can't co-exist in a running # system. This is used to be able to compile all kernel code in one go for # quality assurance purposes (like this file, which the option takes it name # from.) # options COMPILING_LINT # # STACK enables the stack(9) facility, allowing the capture of kernel stack # for the purpose of procinfo(1), etc. stack(9) will also be compiled in # automatically if DDB(4) is compiled into the kernel. # options STACK ##################################################################### # PERFORMANCE MONITORING OPTIONS # # The hwpmc driver that allows the use of in-CPU performance monitoring # counters for performance monitoring. The base kernel needs to be configured # with the 'options' line, while the hwpmc device can be either compiled # in or loaded as a loadable kernel module. # # Additional configuration options may be required on specific architectures, # please see hwpmc(4). device hwpmc # Driver (also a loadable module) options HWPMC_DEBUG options HWPMC_HOOKS # Other necessary kernel hooks ##################################################################### # NETWORKING OPTIONS # # Protocol families # options INET #Internet communications protocols options INET6 #IPv6 communications protocols options ROUTETABLES=2 # allocated fibs up to 65536. default is 1. # but that would be a bad idea as they are large. options TCP_OFFLOAD # TCP offload support. # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration options IPSEC #IP security (requires device crypto) #options IPSEC_DEBUG #debug for IP security # # #DEPRECATED# # Set IPSEC_FILTERTUNNEL to change the default of the sysctl to force packets # coming through a tunnel to be processed by any configured packet filtering # twice. The default is that packets coming out of a tunnel are _not_ processed; # they are assumed trusted. # # IPSEC history is preserved for such packets, and can be filtered # using ipfw(8)'s 'ipsec' keyword, when this option is enabled. # #options IPSEC_FILTERTUNNEL #filter ipsec packets from a tunnel # # Set IPSEC_NAT_T to enable NAT-Traversal support. This enables # optional UDP encapsulation of ESP packets. # options IPSEC_NAT_T #NAT-T support, UDP encap of ESP # # SMB/CIFS requester # NETSMB enables support for SMB protocol, it requires LIBMCHAIN and LIBICONV # options. options NETSMB #SMB/CIFS requester # mchain library. It can be either loaded as KLD or compiled into kernel options LIBMCHAIN # libalias library, performing NAT options LIBALIAS # flowtable cache options FLOWTABLE # # SCTP is a NEW transport protocol defined by # RFC2960 updated by RFC3309 and RFC3758.. and # soon to have a new base RFC and many many more # extensions. This release supports all the extensions # including many drafts (most about to become RFC's). # It is the reference implementation of SCTP # and is quite well tested. # # Note YOU MUST have both INET and INET6 defined. # You don't have to enable V6, but SCTP is # dual stacked and so far we have not torn apart # the V6 and V4.. since an association can span # both a V6 and V4 address at the SAME time :-) # options SCTP # There are bunches of options: # this one turns on all sorts of # nastily printing that you can # do. It's all controlled by a # bit mask (settable by socket opt and # by sysctl). Including will not cause # logging until you set the bits.. but it # can be quite verbose.. so without this # option we don't do any of the tests for # bits and prints.. which makes the code run # faster.. if you are not debugging don't use. options SCTP_DEBUG # # This option turns off the CRC32c checksum. Basically, # you will not be able to talk to anyone else who # has not done this. Its more for experimentation to # see how much CPU the CRC32c really takes. Most new # cards for TCP support checksum offload.. so this # option gives you a "view" into what SCTP would be # like with such an offload (which only exists in # high in iSCSI boards so far). With the new # splitting 8's algorithm its not as bad as it used # to be.. but it does speed things up try only # for in a captured lab environment :-) options SCTP_WITH_NO_CSUM # # # All that options after that turn on specific types of # logging. You can monitor CWND growth, flight size # and all sorts of things. Go look at the code and # see. I have used this to produce interesting # charts and graphs as well :-> # # I have not yet committed the tools to get and print # the logs, I will do that eventually .. before then # if you want them send me an email rrs@freebsd.org # You basically must have ktr(4) enabled for these # and you then set the sysctl to turn on/off various # logging bits. Use ktrdump(8) to pull the log and run # it through a display program.. and graphs and other # things too. # options SCTP_LOCK_LOGGING options SCTP_MBUF_LOGGING options SCTP_MBCNT_LOGGING options SCTP_PACKET_LOGGING options SCTP_LTRACE_CHUNKS options SCTP_LTRACE_ERRORS # altq(9). Enable the base part of the hooks with the ALTQ option. # Individual disciplines must be built into the base system and can not be # loaded as modules at this point. ALTQ requires a stable TSC so if yours is # broken or changes with CPU throttling then you must also have the ALTQ_NOPCC # option. options ALTQ options ALTQ_CBQ # Class Based Queueing options ALTQ_RED # Random Early Detection options ALTQ_RIO # RED In/Out options ALTQ_HFSC # Hierarchical Packet Scheduler options ALTQ_FAIRQ # Fair Packet Scheduler options ALTQ_CDNR # Traffic conditioner options ALTQ_PRIQ # Priority Queueing options ALTQ_NOPCC # Required if the TSC is unusable options ALTQ_DEBUG # netgraph(4). Enable the base netgraph code with the NETGRAPH option. # Individual node types can be enabled with the corresponding option # listed below; however, this is not strictly necessary as netgraph # will automatically load the corresponding KLD module if the node type # is not already compiled into the kernel. Each type below has a # corresponding man page, e.g., ng_async(8). options NETGRAPH # netgraph(4) system options NETGRAPH_DEBUG # enable extra debugging, this # affects netgraph(4) and nodes # Node types options NETGRAPH_ASYNC options NETGRAPH_ATMLLC options NETGRAPH_ATM_ATMPIF options NETGRAPH_BLUETOOTH # ng_bluetooth(4) options NETGRAPH_BLUETOOTH_BT3C # ng_bt3c(4) options NETGRAPH_BLUETOOTH_HCI # ng_hci(4) options NETGRAPH_BLUETOOTH_L2CAP # ng_l2cap(4) options NETGRAPH_BLUETOOTH_SOCKET # ng_btsocket(4) options NETGRAPH_BLUETOOTH_UBT # ng_ubt(4) options NETGRAPH_BLUETOOTH_UBTBCMFW # ubtbcmfw(4) options NETGRAPH_BPF options NETGRAPH_BRIDGE options NETGRAPH_CAR options NETGRAPH_CISCO options NETGRAPH_DEFLATE options NETGRAPH_DEVICE options NETGRAPH_ECHO options NETGRAPH_EIFACE options NETGRAPH_ETHER options NETGRAPH_FRAME_RELAY options NETGRAPH_GIF options NETGRAPH_GIF_DEMUX options NETGRAPH_HOLE options NETGRAPH_IFACE options NETGRAPH_IP_INPUT options NETGRAPH_IPFW options NETGRAPH_KSOCKET options NETGRAPH_L2TP options NETGRAPH_LMI # MPPC compression requires proprietary files (not included) #options NETGRAPH_MPPC_COMPRESSION options NETGRAPH_MPPC_ENCRYPTION options NETGRAPH_NETFLOW options NETGRAPH_NAT options NETGRAPH_ONE2MANY options NETGRAPH_PATCH options NETGRAPH_PIPE options NETGRAPH_PPP options NETGRAPH_PPPOE options NETGRAPH_PPTPGRE options NETGRAPH_PRED1 options NETGRAPH_RFC1490 options NETGRAPH_SOCKET options NETGRAPH_SPLIT options NETGRAPH_SPPP options NETGRAPH_TAG options NETGRAPH_TCPMSS options NETGRAPH_TEE options NETGRAPH_UI options NETGRAPH_VJC options NETGRAPH_VLAN # NgATM - Netgraph ATM options NGATM_ATM options NGATM_ATMBASE options NGATM_SSCOP options NGATM_SSCFU options NGATM_UNI options NGATM_CCATM device mn # Munich32x/Falc54 Nx64kbit/sec cards. # Network stack virtualization. #options VIMAGE #options VNET_DEBUG # debug for VIMAGE # # Network interfaces: # The `loop' device is MANDATORY when networking is enabled. device loop # The `ether' device provides generic code to handle # Ethernets; it is MANDATORY when an Ethernet device driver is # configured or token-ring is enabled. device ether # The `vlan' device implements the VLAN tagging of Ethernet frames # according to IEEE 802.1Q. device vlan # The `vxlan' device implements the VXLAN encapsulation of Ethernet # frames in UDP packets according to RFC7348. device vxlan # The `wlan' device provides generic code to support 802.11 # drivers, including host AP mode; it is MANDATORY for the wi, # and ath drivers and will eventually be required by all 802.11 drivers. device wlan options IEEE80211_DEBUG #enable debugging msgs options IEEE80211_AMPDU_AGE #age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH #enable 802.11s D3.0 support options IEEE80211_SUPPORT_TDMA #enable TDMA support # The `wlan_wep', `wlan_tkip', and `wlan_ccmp' devices provide # support for WEP, TKIP, and AES-CCMP crypto protocols optionally # used with 802.11 devices that depend on the `wlan' module. device wlan_wep device wlan_ccmp device wlan_tkip # The `wlan_xauth' device provides support for external (i.e. user-mode) # authenticators for use with 802.11 drivers that use the `wlan' # module and support 802.1x and/or WPA security protocols. device wlan_xauth # The `wlan_acl' device provides a MAC-based access control mechanism # for use with 802.11 drivers operating in ap mode and using the # `wlan' module. # The 'wlan_amrr' device provides AMRR transmit rate control algorithm device wlan_acl device wlan_amrr # Generic TokenRing device token # The `fddi' device provides generic code to support FDDI. device fddi # The `arcnet' device provides generic code to support Arcnet. device arcnet # The `sppp' device serves a similar role for certain types # of synchronous PPP links (like `cx', `ar'). device sppp # The `bpf' device enables the Berkeley Packet Filter. Be # aware of the legal and administrative consequences of enabling this # option. DHCP requires bpf. device bpf # The `netmap' device implements memory-mapped access to network # devices from userspace, enabling wire-speed packet capture and # generation even at 10Gbit/s. Requires support in the device # driver. Supported drivers are ixgbe, e1000, re. device netmap # The `disc' device implements a minimal network interface, # which throws away all packets sent and never receives any. It is # included for testing and benchmarking purposes. device disc # The `epair' device implements a virtual back-to-back connected Ethernet # like interface pair. device epair # The `edsc' device implements a minimal Ethernet interface, # which discards all packets sent and receives none. device edsc # The `tap' device is a pty-like virtual Ethernet interface device tap # The `tun' device implements (user-)ppp and nos-tun(8) device tun # The `gif' device implements IPv6 over IP4 tunneling, # IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and # IPv6 over IPv6 tunneling. # The `gre' device implements GRE (Generic Routing Encapsulation) tunneling, # as specified in the RFC 2784 and RFC 2890. # The `me' device implements Minimal Encapsulation within IPv4 as # specified in the RFC 2004. # The XBONEHACK option allows the same pair of addresses to be configured on # multiple gif interfaces. device gif device gre device me options XBONEHACK # The `stf' device implements 6to4 encapsulation. device stf # The pf packet filter consists of three devices: # The `pf' device provides /dev/pf and the firewall code itself. # The `pflog' device provides the pflog0 interface which logs packets. # The `pfsync' device provides the pfsync0 interface used for # synchronization of firewall state tables (over the net). device pf device pflog device pfsync # Bridge interface. device if_bridge # Common Address Redundancy Protocol. See carp(4) for more details. device carp # IPsec interface. device enc # Link aggregation interface. device lagg # # Internet family options: # # MROUTING enables the kernel multicast packet forwarder, which works # with mrouted and XORP. # # IPFIREWALL enables support for IP firewall construction, in # conjunction with the `ipfw' program. IPFIREWALL_VERBOSE sends # logged packets to the system logger. IPFIREWALL_VERBOSE_LIMIT # limits the number of times a matching entry can be logged. # # WARNING: IPFIREWALL defaults to a policy of "deny ip from any to any" # and if you do not add other rules during startup to allow access, # YOU WILL LOCK YOURSELF OUT. It is suggested that you set firewall_type=open # in /etc/rc.conf when first enabling this feature, then refining the # firewall rules in /etc/rc.firewall after you've tested that the new kernel # feature works properly. # # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to # allow everything. Use with care, if a cracker can crash your # firewall machine, they can get to your protected machines. However, # if you are using it as an as-needed filter for specific problems as # they arise, then this may be for you. Changing the default to 'allow' # means that you won't get stuck if the kernel and /sbin/ipfw binary get # out of sync. # # IPDIVERT enables the divert IP sockets, used by ``ipfw divert''. It # depends on IPFIREWALL if compiled into the kernel. # # IPFIREWALL_NAT adds support for in kernel nat in ipfw, and it requires # LIBALIAS. # # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding # packets without touching the TTL). This can be useful to hide firewalls # from traceroute and similar tools. # # PF_DEFAULT_TO_DROP causes the default pf(4) rule to deny everything. # # TCPDEBUG enables code which keeps traces of the TCP state machine # for sockets with the SO_DEBUG option set, which can then be examined # using the trpt(8) utility. # # TCPPCAP enables code which keeps the last n packets sent and received # on a TCP socket. # # RADIX_MPATH provides support for equal-cost multi-path routing. # options MROUTING # Multicast routing options IPFIREWALL #firewall options IPFIREWALL_VERBOSE #enable logging to syslogd(8) options IPFIREWALL_VERBOSE_LIMIT=100 #limit verbosity options IPFIREWALL_DEFAULT_TO_ACCEPT #allow everything by default options IPFIREWALL_NAT #ipfw kernel nat support options IPDIVERT #divert sockets options IPFILTER #ipfilter support options IPFILTER_LOG #ipfilter logging options IPFILTER_LOOKUP #ipfilter pools options IPFILTER_DEFAULT_BLOCK #block all packets by default options IPSTEALTH #support for stealth forwarding options PF_DEFAULT_TO_DROP #drop everything by default options TCPDEBUG options TCPPCAP options RADIX_MPATH # The MBUF_STRESS_TEST option enables options which create # various random failures / extreme cases related to mbuf # functions. See mbuf(9) for a list of available test cases. # MBUF_PROFILING enables code to profile the mbuf chains # exiting the system (via participating interfaces) and # return a logarithmic histogram of monitored parameters # (e.g. packet size, wasted space, number of mbufs in chain). options MBUF_STRESS_TEST options MBUF_PROFILING # Statically link in accept filters options ACCEPT_FILTER_DATA options ACCEPT_FILTER_DNS options ACCEPT_FILTER_HTTP # TCP_SIGNATURE adds support for RFC 2385 (TCP-MD5) digests. These are # carried in TCP option 19. This option is commonly used to protect # TCP sessions (e.g. BGP) where IPSEC is not available nor desirable. # This is enabled on a per-socket basis using the TCP_MD5SIG socket option. # This requires the use of 'device crypto' and 'options IPSEC'. options TCP_SIGNATURE #include support for RFC 2385 # DUMMYNET enables the "dummynet" bandwidth limiter. You need IPFIREWALL # as well. See dummynet(4) and ipfw(8) for more info. When you run # DUMMYNET it is advisable to also have at least "options HZ=1000" to achieve # a smooth scheduling of the traffic. options DUMMYNET ##################################################################### # FILESYSTEM OPTIONS # # Only the root filesystem needs to be statically compiled or preloaded # as module; everything else will be automatically loaded at mount # time. Some people still prefer to statically compile other # filesystems as well. # # NB: The UNION filesystem was known to be buggy in the past. It is now # being actively maintained, although there are still some issues being # resolved. # # One of these is mandatory: options FFS #Fast filesystem options NFSCL #Network File System client # The rest are optional: options AUTOFS #Automounter filesystem options CD9660 #ISO 9660 filesystem options FDESCFS #File descriptor filesystem options FUSE #FUSE support module options MSDOSFS #MS DOS File System (FAT, FAT32) options NFSLOCKD #Network Lock Manager options NFSD #Network Filesystem Server options KGSSAPI #Kernel GSSAPI implementation options NULLFS #NULL filesystem options PROCFS #Process filesystem (requires PSEUDOFS) options PSEUDOFS #Pseudo-filesystem framework options PSEUDOFS_TRACE #Debugging support for PSEUDOFS options SMBFS #SMB/CIFS filesystem options TMPFS #Efficient memory filesystem options UDF #Universal Disk Format options UNIONFS #Union filesystem # The xFS_ROOT options REQUIRE the associated ``options xFS'' options NFS_ROOT #NFS usable as root device # Soft updates is a technique for improving filesystem speed and # making abrupt shutdown less risky. # options SOFTUPDATES # Extended attributes allow additional data to be associated with files, # and is used for ACLs, Capabilities, and MAC labels. # See src/sys/ufs/ufs/README.extattr for more information. options UFS_EXTATTR options UFS_EXTATTR_AUTOSTART # Access Control List support for UFS filesystems. The current ACL # implementation requires extended attribute support, UFS_EXTATTR, # for the underlying filesystem. # See src/sys/ufs/ufs/README.acls for more information. options UFS_ACL # Directory hashing improves the speed of operations on very large # directories at the expense of some memory. options UFS_DIRHASH # Gjournal-based UFS journaling support. options UFS_GJOURNAL # Make space in the kernel for a root filesystem on a md device. # Define to the number of kilobytes to reserve for the filesystem. # This is now optional. # If not defined, the root filesystem passed in as the MFS_IMAGE makeoption # will be automatically embedded in the kernel during linking. Its exact size # will be consumed within the kernel. # If defined, the old way of embedding the filesystem in the kernel will be # used. That is to say MD_ROOT_SIZE KB will be allocated in the kernel and # later, the filesystem image passed in as the MFS_IMAGE makeoption will be # dd'd into the reserved space if it fits. options MD_ROOT_SIZE=10 # Make the md device a potential root device, either with preloaded # images of type mfs_root or md_root. options MD_ROOT # Disk quotas are supported when this option is enabled. options QUOTA #enable disk quotas # If you are running a machine just as a fileserver for PC and MAC # users, using SAMBA, you may consider setting this option # and keeping all those users' directories on a filesystem that is # mounted with the suiddir option. This gives new files the same # ownership as the directory (similar to group). It's a security hole # if you let these users run programs, so confine it to file-servers # (but it'll save you lots of headaches in those cases). Root owned # directories are exempt and X bits are cleared. The suid bit must be # set on the directory as well; see chmod(1). PC owners can't see/set # ownerships so they keep getting their toes trodden on. This saves # you all the support calls as the filesystem it's used on will act as # they expect: "It's my dir so it must be my file". # options SUIDDIR # NFS options: options NFS_MINATTRTIMO=3 # VREG attrib cache timeout in sec options NFS_MAXATTRTIMO=60 options NFS_MINDIRATTRTIMO=30 # VDIR attrib cache timeout in sec options NFS_MAXDIRATTRTIMO=60 options NFS_DEBUG # Enable NFS Debugging # # Add support for the EXT2FS filesystem of Linux fame. Be a bit # careful with this - the ext2fs code has a tendency to lag behind # changes and not be exercised very much, so mounting read/write could # be dangerous (and even mounting read only could result in panics.) # options EXT2FS # # Add support for the ReiserFS filesystem (used in Linux). Currently, # this is limited to read-only access. # options REISERFS # Cryptographically secure random number generator; /dev/random device random # The system memory devices; /dev/mem, /dev/kmem device mem # The kernel symbol table device; /dev/ksyms device ksyms # Optional character code conversion support with LIBICONV. # Each option requires their base file system and LIBICONV. options CD9660_ICONV options MSDOSFS_ICONV options UDF_ICONV ##################################################################### # POSIX P1003.1B # Real time extensions added in the 1993 POSIX # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING options _KPOSIX_PRIORITY_SCHEDULING # p1003_1b_semaphores are very experimental, # user should be ready to assist in debugging if problems arise. options P1003_1B_SEMAPHORES # POSIX message queue options P1003_1B_MQUEUE ##################################################################### # SECURITY POLICY PARAMETERS # Support for BSM audit options AUDIT # Support for Mandatory Access Control (MAC): options MAC options MAC_BIBA options MAC_BSDEXTENDED options MAC_IFOFF options MAC_LOMAC options MAC_MLS options MAC_NONE options MAC_PARTITION options MAC_PORTACL options MAC_SEEOTHERUIDS options MAC_STUB options MAC_TEST # Support for Capsicum options CAPABILITIES # fine-grained rights on file descriptors options CAPABILITY_MODE # sandboxes with no global namespace access ##################################################################### # CLOCK OPTIONS # The granularity of operation is controlled by the kernel option HZ whose # default value (1000 on most architectures) means a granularity of 1ms # (1s/HZ). Historically, the default was 100, but finer granularity is # required for DUMMYNET and other systems on modern hardware. There are # reasonable arguments that HZ should, in fact, be 100 still; consider, # that reducing the granularity too much might cause excessive overhead in # clock interrupt processing, potentially causing ticks to be missed and thus # actually reducing the accuracy of operation. options HZ=100 # Enable support for the kernel PLL to use an external PPS signal, # under supervision of [x]ntpd(8) # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp options PPS_SYNC # Enable support for generic feed-forward clocks in the kernel. # The feed-forward clock support is an alternative to the feedback oriented # ntpd/system clock approach, and is to be used with a feed-forward # synchronization algorithm such as the RADclock: # More info here: http://www.synclab.org/radclock options FFCLOCK ##################################################################### # SCSI DEVICES # SCSI DEVICE CONFIGURATION # The SCSI subsystem consists of the `base' SCSI code, a number of # high-level SCSI device `type' drivers, and the low-level host-adapter # device drivers. The host adapters are listed in the ISA and PCI # device configuration sections below. # # It is possible to wire down your SCSI devices so that a given bus, # target, and LUN always come on line as the same device unit. In # earlier versions the unit numbers were assigned in the order that # the devices were probed on the SCSI bus. This means that if you # removed a disk drive, you may have had to rewrite your /etc/fstab # file, and also that you had to be careful when adding a new disk # as it may have been probed earlier and moved your device configuration # around. (See also option GEOM_VOL for a different solution to this # problem.) # This old behavior is maintained as the default behavior. The unit # assignment begins with the first non-wired down unit for a device # type. For example, if you wire a disk as "da3" then the first # non-wired disk will be assigned da4. # The syntax for wiring down devices is: hint.scbus.0.at="ahc0" hint.scbus.1.at="ahc1" hint.scbus.1.bus="0" hint.scbus.3.at="ahc2" hint.scbus.3.bus="0" hint.scbus.2.at="ahc2" hint.scbus.2.bus="1" hint.da.0.at="scbus0" hint.da.0.target="0" hint.da.0.unit="0" hint.da.1.at="scbus3" hint.da.1.target="1" hint.da.2.at="scbus2" hint.da.2.target="3" hint.sa.1.at="scbus1" hint.sa.1.target="6" # "units" (SCSI logical unit number) that are not specified are # treated as if specified as LUN 0. # All SCSI devices allocate as many units as are required. # The ch driver drives SCSI Media Changer ("jukebox") devices. # # The da driver drives SCSI Direct Access ("disk") and Optical Media # ("WORM") devices. # # The sa driver drives SCSI Sequential Access ("tape") devices. # # The cd driver drives SCSI Read Only Direct Access ("cd") devices. # # The ses driver drives SCSI Environment Services ("ses") and # SAF-TE ("SCSI Accessible Fault-Tolerant Enclosure") devices. # # The pt driver drives SCSI Processor devices. # # The sg driver provides a passthrough API that is compatible with the # Linux SG driver. It will work in conjunction with the COMPAT_LINUX # option to run linux SG apps. It can also stand on its own and provide # source level API compatibility for porting apps to FreeBSD. # # Target Mode support is provided here but also requires that a SIM # (SCSI Host Adapter Driver) provide support as well. # # The targ driver provides target mode support as a Processor type device. # It exists to give the minimal context necessary to respond to Inquiry # commands. There is a sample user application that shows how the rest # of the command support might be done in /usr/share/examples/scsi_target. # # The targbh driver provides target mode support and exists to respond # to incoming commands that do not otherwise have a logical unit assigned # to them. # # The pass driver provides a passthrough API to access the CAM subsystem. device scbus #base SCSI code device ch #SCSI media changers device da #SCSI direct access devices (aka disks) device sa #SCSI tapes device cd #SCSI CD-ROMs device ses #Enclosure Services (SES and SAF-TE) device pt #SCSI processor device targ #SCSI Target Mode Code device targbh #SCSI Target Mode Blackhole Device device pass #CAM passthrough driver device sg #Linux SCSI passthrough device ctl #CAM Target Layer # CAM OPTIONS: # debugging options: # CAMDEBUG Compile in all possible debugging. # CAM_DEBUG_COMPILE Debug levels to compile in. # CAM_DEBUG_FLAGS Debug levels to enable on boot. # CAM_DEBUG_BUS Limit debugging to the given bus. # CAM_DEBUG_TARGET Limit debugging to the given target. # CAM_DEBUG_LUN Limit debugging to the given lun. # CAM_DEBUG_DELAY Delay in us after printing each debug line. # # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter) # queue after a bus reset, and the number of milliseconds to # freeze the device queue after a bus device reset. This # can be changed at boot and runtime with the # kern.cam.scsi_delay tunable/sysctl. options CAMDEBUG options CAM_DEBUG_COMPILE=-1 options CAM_DEBUG_FLAGS=(CAM_DEBUG_INFO|CAM_DEBUG_PROBE|CAM_DEBUG_PERIPH) options CAM_DEBUG_BUS=-1 options CAM_DEBUG_TARGET=-1 options CAM_DEBUG_LUN=-1 options CAM_DEBUG_DELAY=1 options CAM_MAX_HIGHPOWER=4 options SCSI_NO_SENSE_STRINGS options SCSI_NO_OP_STRINGS options SCSI_DELAY=5000 # Be pessimistic about Joe SCSI device # Options for the CAM CDROM driver: # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only # enforced if there is I/O waiting for another LUN # The compiled in defaults for these variables are 2 and 10 seconds, # respectively. # # These can also be changed on the fly with the following sysctl variables: # kern.cam.cd.changer.min_busy_seconds # kern.cam.cd.changer.max_busy_seconds # options CHANGER_MIN_BUSY_SECONDS=2 options CHANGER_MAX_BUSY_SECONDS=10 # Options for the CAM sequential access driver: # SA_IO_TIMEOUT: Timeout for read/write/wfm operations, in minutes # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT. options SA_IO_TIMEOUT=4 options SA_SPACE_TIMEOUT=60 options SA_REWIND_TIMEOUT=(2*60) options SA_ERASE_TIMEOUT=(4*60) options SA_1FM_AT_EOD # Optional timeout for the CAM processor target (pt) device # This is specified in seconds. The default is 60 seconds. options SCSI_PT_DEFAULT_TIMEOUT=60 # Optional enable of doing SES passthrough on other devices (e.g., disks) # # Normally disabled because a lot of newer SCSI disks report themselves # as having SES capabilities, but this can then clot up attempts to build # a topology with the SES device that's on the box these drives are in.... options SES_ENABLE_PASSTHROUGH ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS device pty #BSD-style compatibility pseudo ttys device nmdm #back-to-back tty devices device md #Memory/malloc disk device snp #Snoop device - to look at pty/vty/etc.. device ccd #Concatenated disk driver device firmware #firmware(9) support # Kernel side iconv library options LIBICONV # Size of the kernel message buffer. Should be N * pagesize. options MSGBUF_SIZE=40960 ##################################################################### # HARDWARE DEVICE CONFIGURATION # For ISA the required hints are listed. # EISA, MCA, PCI, CardBus, SD/MMC and pccard are self identifying buses, so # no hints are needed. # # Mandatory devices: # # These options are valid for other keyboard drivers as well. options KBD_DISABLE_KEYMAP_LOAD # refuse to load a keymap options KBD_INSTALL_CDEV # install a CDEV entry in /dev device kbdmux # keyboard multiplexer options KBDMUX_DFLT_KEYMAP # specify the built-in keymap makeoptions KBDMUX_DFLT_KEYMAP=it.iso options FB_DEBUG # Frame buffer debugging device splash # Splash screen and screen saver support # Various screen savers. device blank_saver device daemon_saver device dragon_saver device fade_saver device fire_saver device green_saver device logo_saver device rain_saver device snake_saver device star_saver device warp_saver # The syscons console driver (SCO color console compatible). device sc hint.sc.0.at="isa" options MAXCONS=16 # number of virtual consoles options SC_ALT_MOUSE_IMAGE # simplified mouse cursor in text mode options SC_DFLT_FONT # compile font in makeoptions SC_DFLT_FONT=cp850 options SC_DISABLE_KDBKEY # disable `debug' key options SC_DISABLE_REBOOT # disable reboot key sequence options SC_HISTORY_SIZE=200 # number of history buffer lines options SC_MOUSE_CHAR=0x3 # char code for text mode mouse cursor options SC_PIXEL_MODE # add support for the raster text mode # The following options will let you change the default colors of syscons. options SC_NORM_ATTR=(FG_GREEN|BG_BLACK) options SC_NORM_REV_ATTR=(FG_YELLOW|BG_GREEN) options SC_KERNEL_CONS_ATTR=(FG_RED|BG_BLACK) options SC_KERNEL_CONS_REV_ATTR=(FG_BLACK|BG_RED) # The following options will let you change the default behavior of # cut-n-paste feature options SC_CUT_SPACES2TABS # convert leading spaces into tabs options SC_CUT_SEPCHARS=\"x09\" # set of characters that delimit words # (default is single space - \"x20\") # If you have a two button mouse, you may want to add the following option # to use the right button of the mouse to paste text. options SC_TWOBUTTON_MOUSE # You can selectively disable features in syscons. options SC_NO_CUTPASTE options SC_NO_FONT_LOADING options SC_NO_HISTORY options SC_NO_MODE_CHANGE options SC_NO_SYSMOUSE options SC_NO_SUSPEND_VTYSWITCH # `flags' for sc # 0x80 Put the video card in the VESA 800x600 dots, 16 color mode # 0x100 Probe for a keyboard device periodically if one is not present # Enable experimental features of the syscons terminal emulator (teken). options TEKEN_CONS25 # cons25-style terminal emulation options TEKEN_UTF8 # UTF-8 output handling # The vt video console driver. device vt options VT_ALT_TO_ESC_HACK=1 # Prepend ESC sequence to ALT keys options VT_MAXWINDOWS=16 # Number of virtual consoles options VT_TWOBUTTON_MOUSE # Use right mouse button to paste # The following options set the default framebuffer size. options VT_FB_DEFAULT_HEIGHT=480 options VT_FB_DEFAULT_WIDTH=640 # The following options will let you change the default vt terminal colors. options TERMINAL_NORM_ATTR=(FG_GREEN|BG_BLACK) options TERMINAL_KERN_ATTR=(FG_LIGHTRED|BG_BLACK) # # Optional devices: # # # SCSI host adapters: # # adv: All Narrow SCSI bus AdvanSys controllers. # adw: Second Generation AdvanSys controllers including the ADV940UW. # aha: Adaptec 154x/1535/1640 # ahb: Adaptec 174x EISA controllers # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/ # 19160x/29160x, aic7770/aic78xx # ahd: Adaptec 29320/39320 Controllers. # aic: Adaptec 6260/6360, APA-1460 (PC Card), NEC PC9801-100 (C-BUS) # bt: Most Buslogic controllers: including BT-445, BT-54x, BT-64x, BT-74x, # BT-75x, BT-946, BT-948, BT-956, BT-958, SDC3211B, SDC3211F, SDC3222F # esp: Emulex ESP, NCR 53C9x and QLogic FAS families based controllers # including the AMD Am53C974 (found on devices such as the Tekram # DC-390(T)) and the Sun ESP and FAS families of controllers # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters, # ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2, # ISP 12160 Ultra3 SCSI, # Qlogic ISP 2100 and ISP 2200 1Gb Fibre Channel host adapters. # Qlogic ISP 2300 and ISP 2312 2Gb Fibre Channel host adapters. # Qlogic ISP 2322 and ISP 6322 2Gb Fibre Channel host adapters. # ispfw: Firmware module for Qlogic host adapters # mpt: LSI-Logic MPT/Fusion 53c1020 or 53c1030 Ultra4 # or FC9x9 Fibre Channel host adapters. # ncr: NCR 53C810, 53C825 self-contained SCSI host adapters. # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors: # 53C810, 53C810A, 53C815, 53C825, 53C825A, 53C860, 53C875, # 53C876, 53C885, 53C895, 53C895A, 53C896, 53C897, 53C1510D, # 53C1010-33, 53C1010-66. # trm: Tekram DC395U/UW/F DC315U adapters. # wds: WD7000 # # Note that the order is important in order for Buslogic ISA/EISA cards to be # probed correctly. # device bt hint.bt.0.at="isa" hint.bt.0.port="0x330" device adv hint.adv.0.at="isa" device adw device aha hint.aha.0.at="isa" device aic hint.aic.0.at="isa" device ahb device ahc device ahd device esp device iscsi_initiator device isp hint.isp.0.disable="1" hint.isp.0.role="3" hint.isp.0.prefer_iomap="1" hint.isp.0.prefer_memmap="1" hint.isp.0.fwload_disable="1" hint.isp.0.ignore_nvram="1" hint.isp.0.fullduplex="1" hint.isp.0.topology="lport" hint.isp.0.topology="nport" hint.isp.0.topology="lport-only" hint.isp.0.topology="nport-only" # we can't get u_int64_t types, nor can we get strings if it's got # a leading 0x, hence this silly dodge. hint.isp.0.portwnn="w50000000aaaa0000" hint.isp.0.nodewnn="w50000000aaaa0001" device ispfw device mpt device ncr device sym device trm device wds hint.wds.0.at="isa" hint.wds.0.port="0x350" hint.wds.0.irq="11" hint.wds.0.drq="6" # The aic7xxx driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. Unfortunately, # this doesn't work on some motherboards, which prevents it from being the # default. options AHC_ALLOW_MEMIO # Dump the contents of the ahc controller configuration PROM. options AHC_DUMP_EEPROM # Bitmap of units to enable targetmode operations. options AHC_TMODE_ENABLE # Compile in Aic7xxx Debugging code. options AHC_DEBUG # Aic7xxx driver debugging options. See sys/dev/aic7xxx/aic7xxx.h options AHC_DEBUG_OPTS # Print register bitfields in debug output. Adds ~128k to driver # See ahc(4). options AHC_REG_PRETTY_PRINT # Compile in aic79xx debugging code. options AHD_DEBUG # Aic79xx driver debugging options. Adds ~215k to driver. See ahd(4). options AHD_DEBUG_OPTS=0xFFFFFFFF # Print human-readable register definitions when debugging options AHD_REG_PRETTY_PRINT # Bitmap of units to enable targetmode operations. options AHD_TMODE_ENABLE # The adw driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. options ADW_ALLOW_MEMIO # Options used in dev/iscsi (Software iSCSI stack) # options ISCSI_INITIATOR_DEBUG=9 # Options used in dev/isp/ (Qlogic SCSI/FC driver). # # ISP_TARGET_MODE - enable target mode operation # options ISP_TARGET_MODE=1 # # ISP_DEFAULT_ROLES - default role # none=0 # target=1 # initiator=2 # both=3 (not supported currently) # # ISP_INTERNAL_TARGET (trivial internal disk target, for testing) # options ISP_DEFAULT_ROLES=0 # Options used in dev/sym/ (Symbios SCSI driver). #options SYM_SETUP_LP_PROBE_MAP #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d #options SYM_SETUP_SCSI_DIFF #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 #options SYM_SETUP_PCI_PARITY #-PCI parity checking # disabled:0, enabled:1 (default) #options SYM_SETUP_MAX_LUN #-Number of LUNs supported # default:8, range:[1..64] # The 'dpt' driver provides support for old DPT controllers (http://www.dpt.com/). # These have hardware RAID-{0,1,5} support, and do multi-initiator I/O. # The DPT controllers are commonly re-licensed under other brand-names - # some controllers by Olivetti, Dec, HP, AT&T, SNI, AST, Alphatronic, NEC and # Compaq are actually DPT controllers. # # See src/sys/dev/dpt for debugging and other subtle options. # DPT_MEASURE_PERFORMANCE Enables a set of (semi)invasive metrics. Various # instruments are enabled. The tools in # /usr/sbin/dpt_* assume these to be enabled. # DPT_DEBUG_xxxx These are controllable from sys/dev/dpt/dpt.h # DPT_RESET_HBA Make "reset" actually reset the controller # instead of fudging it. Only enable this if you # are 100% certain you need it. device dpt # DPT options #!CAM# options DPT_MEASURE_PERFORMANCE options DPT_RESET_HBA # # Compaq "CISS" RAID controllers (SmartRAID 5* series) # These controllers have a SCSI-like interface, and require the # CAM infrastructure. # device ciss # # Intel Integrated RAID controllers. # This driver was developed and is maintained by Intel. Contacts # at Intel for this driver are # "Kannanthanam, Boji T" and # "Leubner, Achim" . # device iir # # Mylex AcceleRAID and eXtremeRAID controllers with v6 and later # firmware. These controllers have a SCSI-like interface, and require # the CAM infrastructure. # device mly # # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers. Only # one entry is needed; the code will find and configure all supported # controllers. # device ida # Compaq Smart RAID device mlx # Mylex DAC960 device amr # AMI MegaRAID device amrp # SCSI Passthrough interface (optional, CAM req.) device mfi # LSI MegaRAID SAS device mfip # LSI MegaRAID SAS passthrough, requires CAM options MFI_DEBUG device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s # # 3ware ATA RAID # device twe # 3ware ATA RAID # # Serial ATA host controllers: # # ahci: Advanced Host Controller Interface (AHCI) compatible # mvs: Marvell 88SX50XX/88SX60XX/88SX70XX/SoC controllers # siis: SiliconImage SiI3124/SiI3132/SiI3531 controllers # # These drivers are part of cam(4) subsystem. They supersede less featured # ata(4) subsystem drivers, supporting same hardware. device ahci device mvs device siis # # The 'ATA' driver supports all legacy ATA/ATAPI controllers, including # PC Card devices. You only need one "device ata" for it to find all # PCI and PC Card ATA/ATAPI devices on modern machines. # Alternatively, individual bus and chipset drivers may be chosen by using # the 'atacore' driver then selecting the drivers on a per vendor basis. # For example to build a system which only supports a VIA chipset, # omit 'ata' and include the 'atacore', 'atapci' and 'atavia' drivers. device ata # Modular ATA #device atacore # Core ATA functionality #device atacard # CARDBUS support #device atabus # PC98 cbus support #device ataisa # ISA bus support #device atapci # PCI bus support; only generic chipset support # PCI ATA chipsets #device ataacard # ACARD #device ataacerlabs # Acer Labs Inc. (ALI) #device ataamd # American Micro Devices (AMD) #device ataati # ATI #device atacenatek # Cenatek #device atacypress # Cypress #device atacyrix # Cyrix #device atahighpoint # HighPoint #device ataintel # Intel #device ataite # Integrated Technology Inc. (ITE) #device atajmicron # JMicron #device atamarvell # Marvell #device atamicron # Micron #device atanational # National #device atanetcell # NetCell #device atanvidia # nVidia #device atapromise # Promise #device ataserverworks # ServerWorks #device atasiliconimage # Silicon Image Inc. (SiI) (formerly CMD) #device atasis # Silicon Integrated Systems Corp.(SiS) #device atavia # VIA Technologies Inc. # # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add: hint.ata.0.at="isa" hint.ata.0.port="0x1f0" hint.ata.0.irq="14" hint.ata.1.at="isa" hint.ata.1.port="0x170" hint.ata.1.irq="15" # # The following options are valid on the ATA driver: # # ATA_REQUEST_TIMEOUT: the number of seconds to wait for an ATA request # before timing out. #options ATA_REQUEST_TIMEOUT=10 # # Standard floppy disk controllers and floppy tapes, supports # the Y-E DATA External FDD (PC Card) # device fdc hint.fdc.0.at="isa" hint.fdc.0.port="0x3F0" hint.fdc.0.irq="6" hint.fdc.0.drq="2" # # FDC_DEBUG enables floppy debugging. Since the debug output is huge, you # gotta turn it actually on by setting the variable fd_debug with DDB, # however. options FDC_DEBUG # # Activate this line if you happen to have an Insight floppy tape. # Probing them proved to be dangerous for people with floppy disks only, # so it's "hidden" behind a flag: #hint.fdc.0.flags="1" # Specify floppy devices hint.fd.0.at="fdc0" hint.fd.0.drive="0" hint.fd.1.at="fdc0" hint.fd.1.drive="1" # # uart: newbusified driver for serial interfaces. It consolidates the sio(4), # sab(4) and zs(4) drivers. # device uart # Options for uart(4) options UART_PPS_ON_CTS # Do time pulse capturing using CTS # instead of DCD. options UART_POLL_FREQ # Set polling rate, used when hw has # no interrupt support (50 Hz default). # The following hint should only be used for pure ISA devices. It is not # needed otherwise. Use of hints is strongly discouraged. hint.uart.0.at="isa" # The following 3 hints are used when the UART is a system device (i.e., a # console or debug port), but only on platforms that don't have any other # means to pass the information to the kernel. The unit number of the hint # is only used to bundle the hints together. There is no relation to the # unit number of the probed UART. hint.uart.0.port="0x3f8" hint.uart.0.flags="0x10" hint.uart.0.baud="115200" # `flags' for serial drivers that support consoles like sio(4) and uart(4): # 0x10 enable console support for this unit. Other console flags # (if applicable) are ignored unless this is set. Enabling # console support does not make the unit the preferred console. # Boot with -h or set boot_serial=YES in the loader. For sio(4) # specifically, the 0x20 flag can also be set (see above). # Currently, at most one unit can have console support; the # first one (in config file order) with this flag set is # preferred. Setting this flag for sio0 gives the old behavior. # 0x80 use this port for serial line gdb support in ddb. Also known # as debug port. # # Options for serial drivers that support consoles: options BREAK_TO_DEBUGGER # A BREAK/DBG on the console goes to # ddb, if available. # Solaris implements a new BREAK which is initiated by a character # sequence CR ~ ^b which is similar to a familiar pattern used on # Sun servers by the Remote Console. There are FreeBSD extensions: # CR ~ ^p requests force panic and CR ~ ^r requests a clean reboot. options ALT_BREAK_TO_DEBUGGER # Serial Communications Controller # Supports the Siemens SAB 82532 and Zilog Z8530 multi-channel # communications controllers. device scc # PCI Universal Communications driver # Supports various multi port PCI I/O cards. device puc # # Network interfaces: # # MII bus support is required for many PCI Ethernet NICs, # namely those which use MII-compliant transceivers or implement # transceiver control interfaces that operate like an MII. Adding # "device miibus" to the kernel config pulls in support for the generic # miibus API, the common support for for bit-bang'ing the MII and all # of the PHY drivers, including a generic one for PHYs that aren't # specifically handled by an individual driver. Support for specific # PHYs may be built by adding "device mii", "device mii_bitbang" if # needed by the NIC driver and then adding the appropriate PHY driver. device mii # Minimal MII support device mii_bitbang # Common module for bit-bang'ing the MII device miibus # MII support w/ bit-bang'ing and all PHYs device acphy # Altima Communications AC101 device amphy # AMD AM79c873 / Davicom DM910{1,2} device atphy # Attansic/Atheros F1 device axphy # Asix Semiconductor AX88x9x device bmtphy # Broadcom BCM5201/BCM5202 and 3Com 3c905C device brgphy # Broadcom BCM54xx/57xx 1000baseTX device ciphy # Cicada/Vitesse CS/VSC8xxx device e1000phy # Marvell 88E1000 1000/100/10-BT device gentbi # Generic 10-bit 1000BASE-{LX,SX} fiber ifaces device icsphy # ICS ICS1889-1893 device ip1000phy # IC Plus IP1000A/IP1001 device jmphy # JMicron JMP211/JMP202 device lxtphy # Level One LXT-970 device mlphy # Micro Linear 6692 device nsgphy # NatSemi DP8361/DP83865/DP83891 device nsphy # NatSemi DP83840A device nsphyter # NatSemi DP83843/DP83815 device pnaphy # HomePNA device qsphy # Quality Semiconductor QS6612 device rdcphy # RDC Semiconductor R6040 device rgephy # RealTek 8169S/8110S/8211B/8211C device rlphy # RealTek 8139 device rlswitch # RealTek 8305 device smcphy # SMSC LAN91C111 device tdkphy # TDK 89Q2120 device tlphy # Texas Instruments ThunderLAN device truephy # LSI TruePHY device xmphy # XaQti XMAC II # an: Aironet 4500/4800 802.11 wireless adapters. Supports the PCMCIA, # PCI and ISA varieties. # ae: Support for gigabit ethernet adapters based on the Attansic/Atheros # L2 PCI-Express FastEthernet controllers. # age: Support for gigabit ethernet adapters based on the Attansic/Atheros # L1 PCI express gigabit ethernet controllers. # alc: Support for Atheros AR8131/AR8132 PCIe ethernet controllers. # ale: Support for Atheros AR8121/AR8113/AR8114 PCIe ethernet controllers. # ath: Atheros a/b/g WiFi adapters (requires ath_hal and wlan) # bce: Broadcom NetXtreme II (BCM5706/BCM5708) PCI/PCIe Gigabit Ethernet # adapters. # bfe: Broadcom BCM4401 Ethernet adapter. # bge: Support for gigabit ethernet adapters based on the Broadcom # BCM570x family of controllers, including the 3Com 3c996-T, # the Netgear GA302T, the SysKonnect SK-9D21 and SK-9D41, and # the embedded gigE NICs on Dell PowerEdge 2550 servers. # bxe: Broadcom NetXtreme II (BCM5771X/BCM578XX) PCIe 10Gb Ethernet # adapters. # bwi: Broadcom BCM430* and BCM431* family of wireless adapters. # bwn: Broadcom BCM43xx family of wireless adapters. # cas: Sun Cassini/Cassini+ and National Semiconductor DP83065 Saturn # cm: Arcnet SMC COM90c26 / SMC COM90c56 # (and SMC COM90c66 in '56 compatibility mode) adapters. # cxgb: Chelsio T3 based 1GbE/10GbE PCIe Ethernet adapters. # cxgbe:Chelsio T4 and T5 based 1GbE/10GbE/40GbE PCIe Ethernet adapters. # dc: Support for PCI fast ethernet adapters based on the DEC/Intel 21143 # and various workalikes including: # the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics # AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On # 82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II # and the Macronix 98713/98713A/98715/98715A/98725 PMAC. This driver # replaces the old al, ax, dm, pn and mx drivers. List of brands: # Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110, # SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX, # LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204, # KNE110TX. # de: Digital Equipment DC21040 # em: Intel Pro/1000 Gigabit Ethernet 82542, 82543, 82544 based adapters. # igb: Intel Pro/1000 PCI Express Gigabit Ethernet: 82575 and later adapters. # ep: 3Com 3C509, 3C529, 3C556, 3C562D, 3C563D, 3C572, 3C574X, 3C579, 3C589 # and PC Card devices using these chipsets. # ex: Intel EtherExpress Pro/10 and other i82595-based adapters, # Olicom Ethernet PC Card devices. # fe: Fujitsu MB86960A/MB86965A Ethernet # fea: DEC DEFEA EISA FDDI adapter # fpa: Support for the Digital DEFPA PCI FDDI. `device fddi' is also needed. # fxp: Intel EtherExpress Pro/100B # (hint of prefer_iomap can be done to prefer I/O instead of Mem mapping) # gem: Apple GMAC/Sun ERI/Sun GEM # hme: Sun HME (Happy Meal Ethernet) # jme: JMicron JMC260 Fast Ethernet/JMC250 Gigabit Ethernet based adapters. # le: AMD Am7900 LANCE and Am79C9xx PCnet # lge: Support for PCI gigabit ethernet adapters based on the Level 1 # LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX, # SMC TigerCard 1000 (SMC9462SX), and some Addtron cards. # malo: Marvell Libertas wireless NICs. # mwl: Marvell 88W8363 802.11n wireless NICs. # Requires the mwl firmware module # mwlfw: Marvell 88W8363 firmware # msk: Support for gigabit ethernet adapters based on the Marvell/SysKonnect # Yukon II Gigabit controllers, including 88E8021, 88E8022, 88E8061, # 88E8062, 88E8035, 88E8036, 88E8038, 88E8050, 88E8052, 88E8053, # 88E8055, 88E8056 and D-Link 560T/550SX. # lmc: Support for the LMC/SBE wide-area network interface cards. # mlx5: Mellanox ConnectX-4 and ConnectX-4 LX IB and Eth shared code module. # mlx5en:Mellanox ConnectX-4 and ConnectX-4 LX PCIe Ethernet adapters. # my: Myson Fast Ethernet (MTD80X, MTD89X) # nge: Support for PCI gigabit ethernet adapters based on the National # Semiconductor DP83820 and DP83821 chipset. This includes the # SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet # GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the Surecom # EP-320G-TX and the Netgear GA622T. # oce: Emulex 10 Gbit adapters (OneConnect Ethernet) # pcn: Support for PCI fast ethernet adapters based on the AMD Am79c97x # PCnet-FAST, PCnet-FAST+, PCnet-FAST III, PCnet-PRO and PCnet-Home # chipsets. These can also be handled by the le(4) driver if the # pcn(4) driver is left out of the kernel. The le(4) driver does not # support the additional features like the MII bus and burst mode of # the PCnet-FAST and greater chipsets though. # ral: Ralink Technology IEEE 802.11 wireless adapter # re: RealTek 8139C+/8169/816xS/811xS/8101E PCI/PCIe Ethernet adapter # rl: Support for PCI fast ethernet adapters based on the RealTek 8129/8139 # chipset. Note that the RealTek driver defaults to using programmed # I/O to do register accesses because memory mapped mode seems to cause # severe lockups on SMP hardware. This driver also supports the # Accton EN1207D `Cheetah' adapter, which uses a chip called # the MPX 5030/5038, which is either a RealTek in disguise or a # RealTek workalike. Note that the D-Link DFE-530TX+ uses the RealTek # chipset and is supported by this driver, not the 'vr' driver. # sf: Support for Adaptec Duralink PCI fast ethernet adapters based on the # Adaptec AIC-6915 "starfire" controller. # This includes dual and quad port cards, as well as one 100baseFX card. # Most of these are 64-bit PCI devices, except for one single port # card which is 32-bit. # sge: Silicon Integrated Systems SiS190/191 Fast/Gigabit Ethernet adapter # sis: Support for NICs based on the Silicon Integrated Systems SiS 900, # SiS 7016 and NS DP83815 PCI fast ethernet controller chips. # sk: Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs. # This includes the SK-9841 and SK-9842 single port cards (single mode # and multimode fiber) and the SK-9843 and SK-9844 dual port cards # (also single mode and multimode). # The driver will autodetect the number of ports on the card and # attach each one as a separate network interface. # sn: Support for ISA and PC Card Ethernet devices using the # SMC91C90/92/94/95 chips. # ste: Sundance Technologies ST201 PCI fast ethernet controller, includes # the D-Link DFE-550TX. # stge: Support for gigabit ethernet adapters based on the Sundance/Tamarack # TC9021 family of controllers, including the Sundance ST2021/ST2023, # the Sundance/Tamarack TC9021, the D-Link DL-4000 and ASUS NX1101. # ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the # 3Com 3c985, the Netgear GA620 and various others. Note that you will # probably want to bump up kern.ipc.nmbclusters a lot to use this driver. # tl: Support for the Texas Instruments TNETE100 series 'ThunderLAN' # cards and integrated ethernet controllers. This includes several # Compaq Netelligent 10/100 cards and the built-in ethernet controllers # in several Compaq Prosignia, Proliant and Deskpro systems. It also # supports several Olicom 10Mbps and 10/100 boards. # tx: SMC 9432 TX, BTX and FTX cards. (SMC EtherPower II series) # txp: Support for 3Com 3cR990 cards with the "Typhoon" chipset # vr: Support for various fast ethernet adapters based on the VIA # Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips, # including the D-Link DFE520TX and D-Link DFE530TX (see 'rl' for # DFE530TX+), the Hawking Technologies PN102TX, and the AOpen/Acer ALN-320. # vte: DM&P Vortex86 RDC R6040 Fast Ethernet # vx: 3Com 3C590 and 3C595 # wb: Support for fast ethernet adapters based on the Winbond W89C840F chip. # Note: this is not the same as the Winbond W89C940F, which is a # NE2000 clone. # wi: Lucent WaveLAN/IEEE 802.11 PCMCIA adapters. Note: this supports both # the PCMCIA and ISA cards: the ISA card is really a PCMCIA to ISA # bridge with a PCMCIA adapter plugged into it. # xe: Xircom/Intel EtherExpress Pro100/16 PC Card ethernet controller, # Accton Fast EtherCard-16, Compaq Netelligent 10/100 PC Card, # Toshiba 10/100 Ethernet PC Card, Xircom 16-bit Ethernet + Modem 56 # xl: Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast) # Etherlink XL cards and integrated controllers. This includes the # integrated 3c905B-TX chips in certain Dell Optiplex and Dell # Precision desktop machines and the integrated 3c905-TX chips # in Dell Latitude laptop docking stations. # Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX # Order for ISA/EISA devices is important here device cm hint.cm.0.at="isa" hint.cm.0.port="0x2e0" hint.cm.0.irq="9" hint.cm.0.maddr="0xdc000" device ep device ex device fe hint.fe.0.at="isa" hint.fe.0.port="0x300" device fea device sn hint.sn.0.at="isa" hint.sn.0.port="0x300" hint.sn.0.irq="10" device an device wi device xe # PCI Ethernet NICs that use the common MII bus controller code. device ae # Attansic/Atheros L2 FastEthernet device age # Attansic/Atheros L1 Gigabit Ethernet device alc # Atheros AR8131/AR8132 Ethernet device ale # Atheros AR8121/AR8113/AR8114 Ethernet device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device cxgb # Chelsio T3 10 Gigabit Ethernet device cxgb_t3fw # Chelsio T3 10 Gigabit Ethernet firmware device cxgbe # Chelsio T4 and T5 1GbE/10GbE/40GbE device dc # DEC/Intel 21143 and various workalikes device et # Agere ET1310 10/100/Gigabit Ethernet device fxp # Intel EtherExpress PRO/100B (82557, 82558) hint.fxp.0.prefer_iomap="0" device gem # Apple GMAC/Sun ERI/Sun GEM device hme # Sun HME (Happy Meal Ethernet) device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet device lge # Level 1 LXT1001 gigabit Ethernet device mlx5 # Shared code module between IB and Ethernet device mlx5en # Mellanox ConnectX-4 and ConnectX-4 LX device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device my # Myson Fast Ethernet (MTD80X, MTD89X) device nge # NatSemi DP83820 gigabit Ethernet device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device pcn # AMD Am79C97x PCI 10/100 NICs device sf # Adaptec AIC-6915 (``Starfire'') device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device vte # DM&P Vortex86 RDC R6040 Fast Ethernet device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') device em # Intel Pro/1000 Gigabit Ethernet device igb # Intel Pro/1000 PCIE Gigabit Ethernet device ixgb # Intel Pro/10Gbe PCI-X Ethernet device ix # Intel Pro/10Gbe PCIE Ethernet device ixv # Intel Pro/10Gbe PCIE Ethernet VF device le # AMD Am7900 LANCE and Am79C9xx PCnet device mxge # Myricom Myri-10G 10GbE NIC device nxge # Neterion Xframe 10GbE Server/Storage Adapter device oce # Emulex 10 GbE (OneConnect Ethernet) device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') device vxge # Exar/Neterion XFrame 3100 10GbE # PCI FDDI NICs. device fpa # PCI WAN adapters. device lmc # PCI IEEE 802.11 Wireless NICs device ath # Atheros pci/cardbus NIC's device ath_hal # pci/cardbus chip support #device ath_ar5210 # AR5210 chips #device ath_ar5211 # AR5211 chips #device ath_ar5212 # AR5212 chips #device ath_rf2413 #device ath_rf2417 #device ath_rf2425 #device ath_rf5111 #device ath_rf5112 #device ath_rf5413 #device ath_ar5416 # AR5416 chips options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors # All of the AR5212 parts have a problem when paired with the AR71xx # CPUS. These parts have a bug that triggers a fatal bus error on the AR71xx # only. Details of the exact nature of the bug are sketchy, but some can be # found at https://forum.openwrt.org/viewtopic.php?pid=70060 on pages 4, 5 and # 6. This option enables this workaround. There is a performance penalty # for this work around, but without it things don't work at all. The DMA # from the card usually bursts 128 bytes, but on the affected CPUs, only # 4 are safe. options AH_RXCFG_SDMAMW_4BYTES #device ath_ar9160 # AR9160 chips #device ath_ar9280 # AR9280 chips #device ath_ar9285 # AR9285 chips device ath_rate_sample # SampleRate tx rate control for ath device bwi # Broadcom BCM430* BCM431* device bwn # Broadcom BCM43xx device malo # Marvell Libertas wireless NICs. device mwl # Marvell 88W8363 802.11n wireless NICs. device mwlfw device ral # Ralink Technology RT2500 wireless NICs. # Use sf_buf(9) interface for jumbo buffers on ti(4) controllers. #options TI_SF_BUF_JUMBO # Turn on the header splitting option for the ti(4) driver firmware. This # only works for Tigon II chips, and has no effect for Tigon I chips. # This option requires the TI_SF_BUF_JUMBO option above. #options TI_JUMBO_HDRSPLIT # These two options allow manipulating the mbuf cluster size and mbuf size, # respectively. Be very careful with NIC driver modules when changing # these from their default values, because that can potentially cause a # mismatch between the mbuf size assumed by the kernel and the mbuf size # assumed by a module. The only driver that currently has the ability to # detect a mismatch is ti(4). options MCLSHIFT=12 # mbuf cluster shift in bits, 12 == 4KB options MSIZE=512 # mbuf size in bytes # # ATM related options (Cranor version) # (note: this driver cannot be used with the HARP ATM stack) # # The `en' device provides support for Efficient Networks (ENI) # ENI-155 PCI midway cards, and the Adaptec 155Mbps PCI ATM cards (ANA-59x0). # # The `hatm' device provides support for Fore/Marconi HE155 and HE622 # ATM PCI cards. # # The `fatm' device provides support for Fore PCA200E ATM PCI cards. # # The `patm' device provides support for IDT77252 based cards like # ProSum's ProATM-155 and ProATM-25 and IDT's evaluation boards. # # atm device provides generic atm functions and is required for # atm devices. # NATM enables the netnatm protocol family that can be used to # bypass TCP/IP. # # utopia provides the access to the ATM PHY chips and is required for en, # hatm and fatm. # # the current driver supports only PVC operations (no atm-arp, no multicast). # for more details, please read the original documents at # http://www.ccrc.wustl.edu/pub/chuck/tech/bsdatm/bsdatm.html # device atm device en device fatm #Fore PCA200E device hatm #Fore/Marconi HE155/622 device patm #IDT77252 cards (ProATM and IDT) device utopia #ATM PHY driver options NATM #native ATM options LIBMBPOOL #needed by patm, iatm # # Sound drivers # # sound: The generic sound driver. # device sound # # snd_*: Device-specific drivers. # # The flags of the device tell the device a bit more info about the # device that normally is obtained through the PnP interface. # bit 2..0 secondary DMA channel; # bit 4 set if the board uses two dma channels; # bit 15..8 board type, overrides autodetection; leave it # zero if don't know what to put in (and you don't, # since this is unsupported at the moment...). # # snd_ad1816: Analog Devices AD1816 ISA PnP/non-PnP. # snd_als4000: Avance Logic ALS4000 PCI. # snd_atiixp: ATI IXP 200/300/400 PCI. # snd_audiocs: Crystal Semiconductor CS4231 SBus/EBus. Only # for sparc64. # snd_cmi: CMedia CMI8338/CMI8738 PCI. # snd_cs4281: Crystal Semiconductor CS4281 PCI. # snd_csa: Crystal Semiconductor CS461x/428x PCI. (except # 4281) # snd_ds1: Yamaha DS-1 PCI. # snd_emu10k1: Creative EMU10K1 PCI and EMU10K2 (Audigy) PCI. # snd_emu10kx: Creative SoundBlaster Live! and Audigy # snd_envy24: VIA Envy24 and compatible, needs snd_spicds. # snd_envy24ht: VIA Envy24HT and compatible, needs snd_spicds. # snd_es137x: Ensoniq AudioPCI ES137x PCI. # snd_ess: Ensoniq ESS ISA PnP/non-PnP, to be used in # conjunction with snd_sbc. # snd_fm801: Forte Media FM801 PCI. # snd_gusc: Gravis UltraSound ISA PnP/non-PnP. # snd_hda: Intel High Definition Audio (Controller) and # compatible. # snd_hdspe: RME HDSPe AIO and RayDAT. # snd_ich: Intel ICH AC'97 and some more audio controllers # embedded in a chipset, for example nVidia # nForce controllers. # snd_maestro: ESS Technology Maestro-1/2x PCI. # snd_maestro3: ESS Technology Maestro-3/Allegro PCI. # snd_mss: Microsoft Sound System ISA PnP/non-PnP. # snd_neomagic: Neomagic 256 AV/ZX PCI. # snd_sb16: Creative SoundBlaster16, to be used in # conjunction with snd_sbc. # snd_sb8: Creative SoundBlaster (pre-16), to be used in # conjunction with snd_sbc. # snd_sbc: Creative SoundBlaster ISA PnP/non-PnP. # Supports ESS and Avance ISA chips as well. # snd_solo: ESS Solo-1x PCI. # snd_spicds: SPI codec driver, needed by Envy24/Envy24HT drivers. # snd_t4dwave: Trident 4DWave DX/NX PCI, Sis 7018 PCI and Acer Labs # M5451 PCI. # snd_uaudio: USB audio. # snd_via8233: VIA VT8233x PCI. # snd_via82c686: VIA VT82C686A PCI. # snd_vibes: S3 Sonicvibes PCI. device snd_ad1816 device snd_als4000 device snd_atiixp #device snd_audiocs device snd_cmi device snd_cs4281 device snd_csa device snd_ds1 device snd_emu10k1 device snd_emu10kx device snd_envy24 device snd_envy24ht device snd_es137x device snd_ess device snd_fm801 device snd_gusc device snd_hda device snd_hdspe device snd_ich device snd_maestro device snd_maestro3 device snd_mss device snd_neomagic device snd_sb16 device snd_sb8 device snd_sbc device snd_solo device snd_spicds device snd_t4dwave device snd_uaudio device snd_via8233 device snd_via82c686 device snd_vibes # For non-PnP sound cards: hint.pcm.0.at="isa" hint.pcm.0.irq="10" hint.pcm.0.drq="1" hint.pcm.0.flags="0x0" hint.sbc.0.at="isa" hint.sbc.0.port="0x220" hint.sbc.0.irq="5" hint.sbc.0.drq="1" hint.sbc.0.flags="0x15" hint.gusc.0.at="isa" hint.gusc.0.port="0x220" hint.gusc.0.irq="5" hint.gusc.0.drq="1" hint.gusc.0.flags="0x13" # # Following options are intended for debugging/testing purposes: # # SND_DEBUG Enable extra debugging code that includes # sanity checking and possible increase of # verbosity. # # SND_DIAGNOSTIC Similar in a spirit of INVARIANTS/DIAGNOSTIC, # zero tolerance against inconsistencies. # # SND_FEEDER_MULTIFORMAT By default, only 16/32 bit feeders are compiled # in. This options enable most feeder converters # except for 8bit. WARNING: May bloat the kernel. # # SND_FEEDER_FULL_MULTIFORMAT Ditto, but includes 8bit feeders as well. # # SND_FEEDER_RATE_HP (feeder_rate) High precision 64bit arithmetic # as much as possible (the default trying to # avoid it). Possible slowdown. # # SND_PCM_64 (Only applicable for i386/32bit arch) # Process 32bit samples through 64bit # integer/arithmetic. Slight increase of dynamic # range at a cost of possible slowdown. # # SND_OLDSTEREO Only 2 channels are allowed, effectively # disabling multichannel processing. # options SND_DEBUG options SND_DIAGNOSTIC options SND_FEEDER_MULTIFORMAT options SND_FEEDER_FULL_MULTIFORMAT options SND_FEEDER_RATE_HP options SND_PCM_64 options SND_OLDSTEREO # # Miscellaneous hardware: # # scd: Sony CD-ROM using proprietary (non-ATAPI) interface # mcd: Mitsumi CD-ROM using proprietary (non-ATAPI) interface # bktr: Brooktree bt848/848a/849a/878/879 video capture and TV Tuner board # joy: joystick (including IO DATA PCJOY PC Card joystick) # cmx: OmniKey CardMan 4040 pccard smartcard reader # Mitsumi CD-ROM device mcd hint.mcd.0.at="isa" hint.mcd.0.port="0x300" # for the Sony CDU31/33A CDROM device scd hint.scd.0.at="isa" hint.scd.0.port="0x230" device joy # PnP aware, hints for non-PnP only hint.joy.0.at="isa" hint.joy.0.port="0x201" device cmx # # The 'bktr' device is a PCI video capture device using the Brooktree # bt848/bt848a/bt849a/bt878/bt879 chipset. When used with a TV Tuner it forms a # TV card, e.g. Miro PC/TV, Hauppauge WinCast/TV WinTV, VideoLogic Captivator, # Intel Smart Video III, AverMedia, IMS Turbo, FlyVideo. # # options OVERRIDE_CARD=xxx # options OVERRIDE_TUNER=xxx # options OVERRIDE_MSP=1 # options OVERRIDE_DBX=1 # These options can be used to override the auto detection # The current values for xxx are found in src/sys/dev/bktr/bktr_card.h # Using sysctl(8) run-time overrides on a per-card basis can be made # # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_PAL # or # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_NTSC # Specifies the default video capture mode. # This is required for Dual Crystal (28&35MHz) boards where PAL is used # to prevent hangs during initialization, e.g. VideoLogic Captivator PCI. # # options BKTR_USE_PLL # This is required for PAL or SECAM boards with a 28MHz crystal and no 35MHz # crystal, e.g. some new Bt878 cards. # # options BKTR_GPIO_ACCESS # This enables IOCTLs which give user level access to the GPIO port. # # options BKTR_NO_MSP_RESET # Prevents the MSP34xx reset. Good if you initialize the MSP in another OS first # # options BKTR_430_FX_MODE # Switch Bt878/879 cards into Intel 430FX chipset compatibility mode. # # options BKTR_SIS_VIA_MODE # Switch Bt878/879 cards into SIS/VIA chipset compatibility mode which is # needed for some old SiS and VIA chipset motherboards. # This also allows Bt878/879 chips to work on old OPTi (<1997) chipset # motherboards and motherboards with bad or incomplete PCI 2.1 support. # As a rough guess, old = before 1998 # # options BKTR_NEW_MSP34XX_DRIVER # Use new, more complete initialization scheme for the msp34* soundchip. # Should fix stereo autodetection if the old driver does only output # mono sound. # # options BKTR_USE_FREEBSD_SMBUS # Compile with FreeBSD SMBus implementation # # Brooktree driver has been ported to the new I2C framework. Thus, # you'll need to have the following 3 lines in the kernel config. # device smbus # device iicbus # device iicbb # device iicsmb # The iic and smb devices are only needed if you want to control other # I2C slaves connected to the external connector of some cards. # device bktr # # PC Card/PCMCIA and Cardbus # # cbb: pci/cardbus bridge implementing YENTA interface # pccard: pccard slots # cardbus: cardbus slots device cbb device pccard device cardbus # # MMC/SD # # mmc MMC/SD bus # mmcsd MMC/SD memory card # sdhci Generic PCI SD Host Controller # device mmc device mmcsd device sdhci # # SMB bus # # System Management Bus support is provided by the 'smbus' device. # Access to the SMBus device is via the 'smb' device (/dev/smb*), # which is a child of the 'smbus' device. # # Supported devices: # smb standard I/O through /dev/smb* # # Supported SMB interfaces: # iicsmb I2C to SMB bridge with any iicbus interface # bktr brooktree848 I2C hardware interface # intpm Intel PIIX4 (82371AB, 82443MX) Power Management Unit # alpm Acer Aladdin-IV/V/Pro2 Power Management Unit # ichsmb Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA) # viapm VIA VT82C586B/596B/686A and VT8233 Power Management Unit # amdpm AMD 756 Power Management Unit # amdsmb AMD 8111 SMBus 2.0 Controller # nfpm NVIDIA nForce Power Management Unit # nfsmb NVIDIA nForce2/3/4 MCP SMBus 2.0 Controller # ismt Intel SMBus 2.0 controller chips (on Atom S1200, C2000) # device smbus # Bus support, required for smb below. device intpm device alpm device ichsmb device viapm device amdpm device amdsmb device nfpm device nfsmb device ismt device smb # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported devices: # ic i2c network interface # iic i2c standard io # iicsmb i2c to smb bridge. Allow i2c i/o with smb commands. # iicoc simple polling driver for OpenCores I2C controller # # Supported interfaces: # bktr brooktree848 I2C software interface # # Other: # iicbb generic I2C bit-banging code (needed by lpbb, bktr) # device iicbus # Bus support, required for ic/iic/iicsmb below. device iicbb device ic device iic device iicsmb # smb over i2c bridge device iicoc # OpenCores I2C controller support # I2C peripheral devices # # ds133x Dallas Semiconductor DS1337, DS1338 and DS1339 RTC # ds1374 Dallas Semiconductor DS1374 RTC # ds1672 Dallas Semiconductor DS1672 RTC # s35390a Seiko Instruments S-35390A RTC # device ds133x device ds1374 device ds1672 device s35390a # Parallel-Port Bus # # Parallel port bus support is provided by the `ppbus' device. # Multiple devices may be attached to the parallel port, devices # are automatically probed and attached when found. # # Supported devices: # vpo Iomega Zip Drive # Requires SCSI disk support ('scbus' and 'da'), best # performance is achieved with ports in EPP 1.9 mode. # lpt Parallel Printer # plip Parallel network interface # ppi General-purpose I/O ("Geek Port") + IEEE1284 I/O # pps Pulse per second Timing Interface # lpbb Philips official parallel port I2C bit-banging interface # pcfclock Parallel port clock driver. # # Supported interfaces: # ppc ISA-bus parallel port interfaces. # options PPC_PROBE_CHIPSET # Enable chipset specific detection # (see flags in ppc(4)) options DEBUG_1284 # IEEE1284 signaling protocol debug options PERIPH_1284 # Makes your computer act as an IEEE1284 # compliant peripheral options DONTPROBE_1284 # Avoid boot detection of PnP parallel devices options VP0_DEBUG # ZIP/ZIP+ debug options LPT_DEBUG # Printer driver debug options PPC_DEBUG # Parallel chipset level debug options PLIP_DEBUG # Parallel network IP interface debug options PCFCLOCK_VERBOSE # Verbose pcfclock driver options PCFCLOCK_MAX_RETRIES=5 # Maximum read tries (default 10) device ppc hint.ppc.0.at="isa" hint.ppc.0.irq="7" device ppbus device vpo device lpt device plip device ppi device pps device lpbb device pcfclock # # Etherswitch framework and drivers # # etherswitch The etherswitch(4) framework # miiproxy Proxy device for miibus(4) functionality # # Switch hardware support: # arswitch Atheros switches # ip17x IC+ 17x family switches # rtl8366r Realtek RTL8366 switches # ukswitch Multi-PHY switches # device etherswitch device miiproxy device arswitch device ip17x device rtl8366rb device ukswitch # Kernel BOOTP support options BOOTP # Use BOOTP to obtain IP address/hostname # Requires NFSCL and NFS_ROOT options BOOTP_NFSROOT # NFS mount root filesystem using BOOTP info options BOOTP_NFSV3 # Use NFS v3 to NFS mount root options BOOTP_COMPAT # Workaround for broken bootp daemons. options BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP options BOOTP_BLOCKSIZE=8192 # Override NFS block size # # Add software watchdog routines. # options SW_WATCHDOG # # Add the software deadlock resolver thread. # options DEADLKRES # # Disable swapping of stack pages. This option removes all # code which actually performs swapping, so it's not possible to turn # it back on at run-time. # # This is sometimes usable for systems which don't have any swap space # (see also sysctls "vm.defer_swapspace_pageouts" and # "vm.disable_swapspace_pageouts") # #options NO_SWAPPING # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers # for sendfile(2) that are used to map file VM pages, and normally # default to a quantity that is roughly 16*MAXUSERS+512. You would # typically want about 4 of these for each simultaneous file send. # options NSFBUFS=1024 # # Enable extra debugging code for locks. This stores the filename and # line of whatever acquired the lock in the lock itself, and changes a # number of function calls to pass around the relevant data. This is # not at all useful unless you are debugging lock code. Note that # modules should be recompiled as this option modifies KBI. # options DEBUG_LOCKS ##################################################################### # USB support # UHCI controller device uhci # OHCI controller device ohci # EHCI controller device ehci # XHCI controller device xhci # SL811 Controller #device slhci # General USB code (mandatory for USB) device usb # # USB Double Bulk Pipe devices device udbp # USB Fm Radio device ufm # USB temperature meter device ugold # USB LED device uled # Human Interface Device (anything with buttons and dials) device uhid # USB keyboard device ukbd # USB printer device ulpt # USB mass storage driver (Requires scbus and da) device umass # USB mass storage driver for device-side mode device usfs # USB support for Belkin F5U109 and Magic Control Technology serial adapters device umct # USB modem support device umodem # USB mouse device ums # USB touchpad(s) device atp device wsp # eGalax USB touch screen device uep # Diamond Rio 500 MP3 player device urio # # USB serial support device ucom # USB support for 3G modem cards by Option, Novatel, Huawei and Sierra device u3g # USB support for Technologies ARK3116 based serial adapters device uark # USB support for Belkin F5U103 and compatible serial adapters device ubsa # USB support for serial adapters based on the FT8U100AX and FT8U232AM device uftdi # USB support for some Windows CE based serial communication. device uipaq # USB support for Prolific PL-2303 serial adapters device uplcom # USB support for Silicon Laboratories CP2101/CP2102 based USB serial adapters device uslcom # USB Visor and Palm devices device uvisor # USB serial support for DDI pocket's PHS device uvscom # # USB ethernet support device uether # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus # eval board. device aue # ASIX Electronics AX88172 USB 2.0 ethernet driver. Used in the # LinkSys USB200M and various other adapters. device axe # ASIX Electronics AX88178A/AX88179 USB 2.0/3.0 gigabit ethernet driver. device axge # # Devices which communicate using Ethernet over USB, particularly # Communication Device Class (CDC) Ethernet specification. Supports # Sharp Zaurus PDAs, some DOCSIS cable modems and so on. device cdce # # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate # and Netmate II, and the Belkin F5U111. device cue # # Kawasaki LSI ethernet. Supports the LinkSys USB10T, # Entrega USB-NET-E45, Peracom Ethernet Adapter, the # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T, # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB # and 2104USB, and the Corega USB-T. device kue # # RealTek RTL8150 USB to fast ethernet. Supports the Melco LUA-KTX # and the GREEN HOUSE GH-USB100B. device rue # # Davicom DM9601E USB to fast ethernet. Supports the Corega FEther USB-TXC. device udav # # RealTek RTL8152 USB to fast ethernet. device ure # # Moschip MCS7730/MCS7840 USB to fast ethernet. Supports the Sitecom LN030. device mos # # HSxPA devices from Option N.V device uhso # Realtek RTL8188SU/RTL8191SU/RTL8192SU wireless driver device rsu # # Ralink Technology RT2501USB/RT2601USB wireless driver device rum # Ralink Technology RT2700U/RT2800U/RT3000U wireless driver device run # # Atheros AR5523 wireless driver device uath # # Conexant/Intersil PrismGT wireless driver device upgt # # Ralink Technology RT2500USB wireless driver device ural # # RNDIS USB ethernet driver device urndis # Realtek RTL8187B/L wireless driver device urtw # # ZyDas ZD1211/ZD1211B wireless driver device zyd # # Sierra USB wireless driver device usie # # debugging options for the USB subsystem # options USB_DEBUG options U3G_DEBUG # options for ukbd: options UKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions UKBD_DFLT_KEYMAP=jp.pc98 # options for uplcom: options UPLCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds # options for uvscom: options UVSCOM_DEFAULT_OPKTSIZE=8 # default output packet size options UVSCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds ##################################################################### # FireWire support device firewire # FireWire bus code device sbp # SCSI over Firewire (Requires scbus and da) device sbp_targ # SBP-2 Target mode (Requires scbus and targ) device fwe # Ethernet over FireWire (non-standard!) device fwip # IP over FireWire (RFC2734 and RFC3146) ##################################################################### # dcons support (Dumb Console Device) device dcons # dumb console driver device dcons_crom # FireWire attachment options DCONS_BUF_SIZE=16384 # buffer size options DCONS_POLL_HZ=100 # polling rate options DCONS_FORCE_CONSOLE=0 # force to be the primary console options DCONS_FORCE_GDB=1 # force to be the gdb device ##################################################################### # crypto subsystem # # This is a port of the OpenBSD crypto framework. Include this when # configuring IPSEC and when you have a h/w crypto device to accelerate # user applications that link to OpenSSL. # # Drivers are ports from OpenBSD with some simple enhancements that have # been fed back to OpenBSD. device crypto # core crypto support # Only install the cryptodev device if you are running tests, or know # specifically why you need it. In most cases, it is not needed and # will make things slower. device cryptodev # /dev/crypto for access to h/w device rndtest # FIPS 140-2 entropy tester device hifn # Hifn 7951, 7781, etc. options HIFN_DEBUG # enable debugging support: hw.hifn.debug options HIFN_RNDTEST # enable rndtest support device ubsec # Broadcom 5501, 5601, 58xx options UBSEC_DEBUG # enable debugging support: hw.ubsec.debug options UBSEC_RNDTEST # enable rndtest support ##################################################################### # # Embedded system options: # # An embedded system might want to run something other than init. options INIT_PATH=/sbin/init:/rescue/init # Debug options options BUS_DEBUG # enable newbus debugging options DEBUG_VFS_LOCKS # enable VFS lock debugging options SOCKBUF_DEBUG # enable sockbuf last record/mb tail checking options IFMEDIA_DEBUG # enable debugging in net/if_media.c # # Verbose SYSINIT # # Make the SYSINIT process performed by mi_startup() verbose. This is very # useful when porting to a new architecture. If DDB is also enabled, this # will print function names instead of addresses. options VERBOSE_SYSINIT ##################################################################### # SYSV IPC KERNEL PARAMETERS # # Maximum number of System V semaphores that can be used on the system at # one time. options SEMMNI=11 # Total number of semaphores system wide options SEMMNS=61 # Total number of undo structures in system options SEMMNU=31 # Maximum number of System V semaphores that can be used by a single process # at one time. options SEMMSL=61 # Maximum number of operations that can be outstanding on a single System V # semaphore at one time. options SEMOPM=101 # Maximum number of undo operations that can be outstanding on a single # System V semaphore at one time. options SEMUME=11 # Maximum number of shared memory pages system wide. options SHMALL=1025 # Maximum size, in bytes, of a single System V shared memory region. options SHMMAX=(SHMMAXPGS*PAGE_SIZE+1) options SHMMAXPGS=1025 # Minimum size, in bytes, of a single System V shared memory region. options SHMMIN=2 # Maximum number of shared memory regions that can be used on the system # at one time. options SHMMNI=33 # Maximum number of System V shared memory regions that can be attached to # a single process at one time. options SHMSEG=9 # Set the amount of time (in seconds) the system will wait before # rebooting automatically when a kernel panic occurs. If set to (-1), # the system will wait indefinitely until a key is pressed on the # console. options PANIC_REBOOT_WAIT_TIME=16 # Attempt to bypass the buffer cache and put data directly into the # userland buffer for read operation when O_DIRECT flag is set on the # file. Both offset and length of the read operation must be # multiples of the physical media sector size. # options DIRECTIO # Specify a lower limit for the number of swap I/O buffers. They are # (among other things) used when bypassing the buffer cache due to # DIRECTIO kernel option enabled and O_DIRECT flag set on file. # options NSWBUF_MIN=120 ##################################################################### # More undocumented options for linting. # Note that documenting these is not considered an affront. options CAM_DEBUG_DELAY # VFS cluster debugging. options CLUSTERDEBUG options DEBUG # Kernel filelock debugging. options LOCKF_DEBUG # System V compatible message queues # Please note that the values provided here are used to test kernel # building. The defaults in the sources provide almost the same numbers. # MSGSSZ must be a power of 2 between 8 and 1024. options MSGMNB=2049 # Max number of chars in queue options MSGMNI=41 # Max number of message queue identifiers options MSGSEG=2049 # Max number of message segments options MSGSSZ=16 # Size of a message segment options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers options SCSI_NCR_DEBUG options SCSI_NCR_MAX_SYNC=10000 options SCSI_NCR_MAX_WIDE=1 options SCSI_NCR_MYADDR=7 options SC_DEBUG_LEVEL=5 # Syscons debug level options SC_RENDER_DEBUG # syscons rendering debugging options VFS_BIO_DEBUG # VFS buffer I/O debugging options KSTACK_MAX_PAGES=32 # Maximum pages to give the kernel stack options KSTACK_USAGE_PROF # Adaptec Array Controller driver options options AAC_DEBUG # Debugging levels: # 0 - quiet, only emit warnings # 1 - noisy, emit major function # points and things done # 2 - extremely noisy, emit trace # items in loops, etc. # Resource Accounting options RACCT # Resource Limits options RCTL # Yet more undocumented options for linting. # BKTR_ALLOC_PAGES has no effect except to cause warnings, and # BROOKTREE_ALLOC_PAGES hasn't actually been superseded by it, since the # driver still mostly spells this option BROOKTREE_ALLOC_PAGES. ##options BKTR_ALLOC_PAGES=(217*4+1) options BROOKTREE_ALLOC_PAGES=(217*4+1) options MAXFILES=999 # Random number generator # Only ONE of the below two may be used; they are mutually exclusive. # If neither is present, then the Fortuna algorithm is selected. #options RANDOM_YARROW # Yarrow CSPRNG (old default) #options RANDOM_LOADABLE # Allow the algorithm to be loaded as # a module. # Select this to allow high-rate but potentially expensive # harvesting of Slab-Allocator entropy. In very high-rate # situations the value of doing this is dubious at best. options RANDOM_ENABLE_UMA # slab allocator # Module to enable execution of application via emulators like QEMU options IMAGACT_BINMISC # Intel em(4) driver options EM_MULTIQUEUE # Activate multiqueue features/disable MSI-X # zlib I/O stream support # This enables support for compressed core dumps. options GZIO Index: head/sys/conf/options =================================================================== --- head/sys/conf/options (revision 297747) +++ head/sys/conf/options (revision 297748) @@ -1,970 +1,972 @@ # $FreeBSD$ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switch code on, or only switch # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.". Options that affect a single source-file # .[c|s] should be directed into "opt_.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h ADW_ALLOW_MEMIO opt_adw.h TWA_DEBUG opt_twa.h TWA_FLASH_FIRMWARE opt_twa.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h SYSCTL_DEBUG opt_sysctl.h EARLY_PRINTF opt_global.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h # Miscellaneous options. ADAPTIVE_LOCKMGRS ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h COMPAT_43TTY opt_compat.h COMPAT_FREEBSD4 opt_compat.h COMPAT_FREEBSD5 opt_compat.h COMPAT_FREEBSD6 opt_compat.h COMPAT_FREEBSD7 opt_compat.h COMPAT_FREEBSD9 opt_compat.h COMPAT_FREEBSD10 opt_compat.h COMPAT_CLOUDABI64 opt_dontuse.h COMPAT_LINUXKPI opt_compat.h COMPILING_LINT opt_global.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h +DEVICE_NUMA EXT_RESOURCES opt_global.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h GZIO opt_gzio.h IMAGACT_BINMISC opt_dontuse.h IPI_PREEMPTION opt_sched.h GEOM_AES opt_geom.h GEOM_BDE opt_geom.h GEOM_BSD opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_FOX opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LABEL_GPT opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MAP opt_geom.h GEOM_MBR opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MOUNTVER opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_EBR_COMPAT opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_PART_PC98 opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_PC98 opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_SUNLABEL opt_geom.h GEOM_UZIP opt_geom.h GEOM_UZIP_DEBUG opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_VOL opt_geom.h GEOM_ZERO opt_geom.h KDTRACE_HOOKS opt_global.h KDTRACE_FRAME opt_kdtrace.h KN_HASHSIZE opt_kqueue.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_IFOFF opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_SIZE opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_EVENTTIMERS opt_timer.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_swap.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_IOV opt_global.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h SPX_HACK STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TURNSTILE_PROFILING UMTX_PROFILING VERBOSE_SYSINIT WLCACHE opt_wavelan.h WLDEBUG opt_wavelan.h # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSE opt_dontuse.h MSDOSFS opt_dontuse.h NANDFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h REISERFS opt_dontuse.h SMBFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of thigs # and make the system safer from crashes at the same time. # otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # The below sentence is not in English, and neither is this one. # We plan to remove the static dependences above, with a # _ROOT option to control if it usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ADA_TEST_FAILURE opt_ada.h ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). SYM_SETUP_LP_PROBE_MAP opt_sym.h #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in dev/ncr/* SCSI_NCR_DEBUG opt_ncr.h SCSI_NCR_MAX_SYNC opt_ncr.h SCSI_NCR_MAX_WIDE opt_ncr.h SCSI_NCR_MYADDR opt_ncr.h # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_CODEL opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_FAIRQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h INET opt_inet.h INET6 opt_inet6.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_FILTERTUNNEL opt_ipsec.h IPSEC_NAT_T opt_ipsec.h IPSTEALTH KRPC LIBALIAS LIBMBPOOL LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCPPCAP opt_global.h SIFTR TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_RFC7413 opt_inet.h TCP_RFC7413_MAX_KEYS opt_inet.h TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h XBONEHACK FLOWTABLE opt_route.h FLOWTABLE_HASH_ALL opt_route.h # # SCTP # SCTP opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_WITH_NO_CSUM opt_sctp.h # Use this at your peril SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_ATMLLC opt_netgraph.h NETGRAPH_ATM_ATMPIF opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h # MPPC compression requires proprietary files (not included) NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h NGATM_ATMBASE opt_netgraph.h NGATM_SSCOP opt_netgraph.h NGATM_SSCFU opt_netgraph.h NGATM_UNI opt_netgraph.h NGATM_CCATM opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # XXX Conflict: # of devices vs network protocol (Native ATM). # This makes "atm.h" unusable. NATM # DPT driver debug flags DPT_MEASURE_PERFORMANCE opt_dpt.h DPT_RESET_HBA opt_dpt.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. CLUSTERDEBUG opt_debug_cluster.h DEBUG_1284 opt_ppb_1284.h VP0_DEBUG opt_vpo.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h SI_DEBUG opt_debug_si.h IFMEDIA_DEBUG opt_ifmedia.h # Fb options FB_DEBUG opt_fb.h FB_INSTALL_CDEV opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_global.h MCLSHIFT opt_global.h MUTEX_DEBUG opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h LOCK_PROFILING_FAST opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h +VM_NUMA_ALLOC opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options SMP opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_DEBUG opt_nfs.h # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver OVERRIDE_CARD opt_bktr.h OVERRIDE_TUNER opt_bktr.h OVERRIDE_DBX opt_bktr.h OVERRIDE_MSP opt_bktr.h BROOKTREE_SYSTEM_DEFAULT opt_bktr.h BROOKTREE_ALLOC_PAGES opt_bktr.h BKTR_OVERRIDE_CARD opt_bktr.h BKTR_OVERRIDE_TUNER opt_bktr.h BKTR_OVERRIDE_DBX opt_bktr.h BKTR_OVERRIDE_MSP opt_bktr.h BKTR_SYSTEM_DEFAULT opt_bktr.h BKTR_ALLOC_PAGES opt_bktr.h BKTR_USE_PLL opt_bktr.h BKTR_GPIO_ACCESS opt_bktr.h BKTR_NO_MSP_RESET opt_bktr.h BKTR_430_FX_MODE opt_bktr.h BKTR_SIS_VIA_MODE opt_bktr.h BKTR_USE_FREEBSD_SMBUS opt_bktr.h BKTR_NEW_MSP34XX_DRIVER opt_bktr.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h UART_DEV_TOLERANCE_PCT opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # options for the Realtek RTL8188*U/RTL8192CU driver (urtwn) URTWN_WITHOUT_UCODE opt_urtwn.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h ACPI_DMAR opt_acpi.h DEV_ACPI opt_acpi.h # ISA support DEV_ISA opt_isa.h ISAPNP opt_isa.h # various 'device presence' options. DEV_BPF opt_bpf.h DEV_CARP opt_carp.h DEV_MCA opt_mca.h DEV_NETMAP opt_global.h DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_RANDOM opt_global.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h # EISA support DEV_EISA opt_eisa.h EISA_SLOTS opt_eisa.h # ed driver ED_HPP opt_ed.h ED_3C503 opt_ed.h ED_SIC opt_ed.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for ubsec driver UBSEC_DEBUG opt_ubsec.h UBSEC_RNDTEST opt_ubsec.h UBSEC_NO_RNG opt_ubsec.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_DEFAULT_WIDTH opt_syscons.h VT_FB_DEFAULT_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h KBDMUX_DFLT_KEYMAP opt_kbdmux.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_11N opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h # options for the Atheros hal AH_SUPPORT_AR5416 opt_ah.h # XXX For now, this breaks non-AR9130 chipsets, so only use it # XXX when actually targetting AR9130. AH_SUPPORT_AR9130 opt_ah.h # This is required for AR933x SoC support AH_SUPPORT_AR9330 opt_ah.h AH_SUPPORT_AR9340 opt_ah.h AH_SUPPORT_QCA9530 opt_ah.h AH_SUPPORT_QCA9550 opt_ah.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Intel 802.11ac wireless driver IWM_DEBUG opt_iwm.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # Options for the Intel 3945ABG wireless driver WPI_DEBUG opt_wpi.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS HWPMC_MIPS_BACKTRACE opt_hwpmc_hooks.h # XBOX options for FreeBSD/i386, but some files are MI XBOX opt_xbox.h # Interrupt filtering INTR_FILTER # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_AMPDU_AGE opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # VideoMode PICKMODE_DEBUG opt_videomode.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h CFI_HARDWAREBYTESWAP opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) # Which CSPRNG hash we get. # If Yarrow is not chosen, Fortuna is selected. RANDOM_YARROW opt_global.h # With this, no entropy processor is loaded, but the entropy # harvesting infrastructure is present. This means an entropy # processor may be loaded as a module. RANDOM_LOADABLE opt_global.h # This turns on high-rate and potentially expensive harvesting in # the uma slab allocator. RANDOM_ENABLE_UMA opt_global.h # Intel em(4) driver EM_MULTIQUEUE opt_em.h Index: head/sys/dev/acpica/acpi.c =================================================================== --- head/sys/dev/acpica/acpi.c (revision 297747) +++ head/sys/dev/acpica/acpi.c (revision 297748) @@ -1,4057 +1,4059 @@ /*- * Copyright (c) 2000 Takanori Watanabe * Copyright (c) 2000 Mitsuru IWASAKI * Copyright (c) 2000, 2001 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" +#include "opt_device_numa.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_ACPIDEV, "acpidev", "ACPI devices"); /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_BUS ACPI_MODULE_NAME("ACPI") static d_open_t acpiopen; static d_close_t acpiclose; static d_ioctl_t acpiioctl; static struct cdevsw acpi_cdevsw = { .d_version = D_VERSION, .d_open = acpiopen, .d_close = acpiclose, .d_ioctl = acpiioctl, .d_name = "acpi", }; struct acpi_interface { ACPI_STRING *data; int num; }; /* Global mutex for locking access to the ACPI subsystem. */ struct mtx acpi_mutex; struct callout acpi_sleep_timer; /* Bitmap of device quirks. */ int acpi_quirks; /* Supported sleep states. */ static BOOLEAN acpi_sleep_states[ACPI_S_STATE_COUNT]; static void acpi_lookup(void *arg, const char *name, device_t *dev); static int acpi_modevent(struct module *mod, int event, void *junk); static int acpi_probe(device_t dev); static int acpi_attach(device_t dev); static int acpi_suspend(device_t dev); static int acpi_resume(device_t dev); static int acpi_shutdown(device_t dev); static device_t acpi_add_child(device_t bus, u_int order, const char *name, int unit); static int acpi_print_child(device_t bus, device_t child); static void acpi_probe_nomatch(device_t bus, device_t child); static void acpi_driver_added(device_t dev, driver_t *driver); static int acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result); static int acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value); static struct resource_list *acpi_get_rlist(device_t dev, device_t child); static void acpi_reserve_resources(device_t dev); static int acpi_sysres_alloc(device_t dev); static int acpi_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count); static struct resource *acpi_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end); static int acpi_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r); static void acpi_delete_resource(device_t bus, device_t child, int type, int rid); static uint32_t acpi_isa_get_logicalid(device_t dev); static int acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count); static char *acpi_device_id_probe(device_t bus, device_t dev, char **ids); static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret); static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *context, void **retval); static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev, int max_depth, acpi_scan_cb_t user_fn, void *arg); static int acpi_set_powerstate(device_t child, int state); static int acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids); static void acpi_probe_children(device_t bus); static void acpi_probe_order(ACPI_HANDLE handle, int *order); static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status); static void acpi_sleep_enable(void *arg); static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc); static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state); static void acpi_shutdown_final(void *arg, int howto); static void acpi_enable_fixed_events(struct acpi_softc *sc); static BOOLEAN acpi_has_hid(ACPI_HANDLE handle); static void acpi_resync_clock(struct acpi_softc *sc); static int acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate); static int acpi_wake_run_prep(ACPI_HANDLE handle, int sstate); static int acpi_wake_prep_walk(int sstate); static int acpi_wake_sysctl_walk(device_t dev); static int acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS); static void acpi_system_eventhandler_sleep(void *arg, int state); static void acpi_system_eventhandler_wakeup(void *arg, int state); static int acpi_sname2sstate(const char *sname); static const char *acpi_sstate2sname(int sstate); static int acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_pm_func(u_long cmd, void *arg, ...); static int acpi_child_location_str_method(device_t acdev, device_t child, char *buf, size_t buflen); static int acpi_child_pnpinfo_str_method(device_t acdev, device_t child, char *buf, size_t buflen); #if defined(__i386__) || defined(__amd64__) static void acpi_enable_pcie(void); #endif static void acpi_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp); static void acpi_reset_interfaces(device_t dev); static device_method_t acpi_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_probe), DEVMETHOD(device_attach, acpi_attach), DEVMETHOD(device_shutdown, acpi_shutdown), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_suspend, acpi_suspend), DEVMETHOD(device_resume, acpi_resume), /* Bus interface */ DEVMETHOD(bus_add_child, acpi_add_child), DEVMETHOD(bus_print_child, acpi_print_child), DEVMETHOD(bus_probe_nomatch, acpi_probe_nomatch), DEVMETHOD(bus_driver_added, acpi_driver_added), DEVMETHOD(bus_read_ivar, acpi_read_ivar), DEVMETHOD(bus_write_ivar, acpi_write_ivar), DEVMETHOD(bus_get_resource_list, acpi_get_rlist), DEVMETHOD(bus_set_resource, acpi_set_resource), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_alloc_resource, acpi_alloc_resource), DEVMETHOD(bus_adjust_resource, acpi_adjust_resource), DEVMETHOD(bus_release_resource, acpi_release_resource), DEVMETHOD(bus_delete_resource, acpi_delete_resource), DEVMETHOD(bus_child_pnpinfo_str, acpi_child_pnpinfo_str_method), DEVMETHOD(bus_child_location_str, acpi_child_location_str_method), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_hint_device_unit, acpi_hint_device_unit), DEVMETHOD(bus_get_domain, acpi_get_domain), /* ACPI bus */ DEVMETHOD(acpi_id_probe, acpi_device_id_probe), DEVMETHOD(acpi_evaluate_object, acpi_device_eval_obj), DEVMETHOD(acpi_pwr_for_sleep, acpi_device_pwr_for_sleep), DEVMETHOD(acpi_scan_children, acpi_device_scan_children), /* ISA emulation */ DEVMETHOD(isa_pnp_probe, acpi_isa_pnp_probe), DEVMETHOD_END }; static driver_t acpi_driver = { "acpi", acpi_methods, sizeof(struct acpi_softc), }; static devclass_t acpi_devclass; DRIVER_MODULE(acpi, nexus, acpi_driver, acpi_devclass, acpi_modevent, 0); MODULE_VERSION(acpi, 1); ACPI_SERIAL_DECL(acpi, "ACPI root bus"); /* Local pools for managing system resources for ACPI child devices. */ static struct rman acpi_rman_io, acpi_rman_mem; #define ACPI_MINIMUM_AWAKETIME 5 /* Holds the description of the acpi0 device. */ static char acpi_desc[ACPI_OEM_ID_SIZE + ACPI_OEM_TABLE_ID_SIZE + 2]; SYSCTL_NODE(_debug, OID_AUTO, acpi, CTLFLAG_RD, NULL, "ACPI debugging"); static char acpi_ca_version[12]; SYSCTL_STRING(_debug_acpi, OID_AUTO, acpi_ca_version, CTLFLAG_RD, acpi_ca_version, 0, "Version of Intel ACPI-CA"); /* * Allow overriding _OSI methods. */ static char acpi_install_interface[256]; TUNABLE_STR("hw.acpi.install_interface", acpi_install_interface, sizeof(acpi_install_interface)); static char acpi_remove_interface[256]; TUNABLE_STR("hw.acpi.remove_interface", acpi_remove_interface, sizeof(acpi_remove_interface)); /* Allow users to dump Debug objects without ACPI debugger. */ static int acpi_debug_objects; TUNABLE_INT("debug.acpi.enable_debug_objects", &acpi_debug_objects); SYSCTL_PROC(_debug_acpi, OID_AUTO, enable_debug_objects, CTLFLAG_RW | CTLTYPE_INT, NULL, 0, acpi_debug_objects_sysctl, "I", "Enable Debug objects"); /* Allow the interpreter to ignore common mistakes in BIOS. */ static int acpi_interpreter_slack = 1; TUNABLE_INT("debug.acpi.interpreter_slack", &acpi_interpreter_slack); SYSCTL_INT(_debug_acpi, OID_AUTO, interpreter_slack, CTLFLAG_RDTUN, &acpi_interpreter_slack, 1, "Turn on interpreter slack mode."); /* Ignore register widths set by FADT and use default widths instead. */ static int acpi_ignore_reg_width = 1; TUNABLE_INT("debug.acpi.default_register_width", &acpi_ignore_reg_width); SYSCTL_INT(_debug_acpi, OID_AUTO, default_register_width, CTLFLAG_RDTUN, &acpi_ignore_reg_width, 1, "Ignore register widths set by FADT"); #ifdef __amd64__ /* Reset system clock while resuming. XXX Remove once tested. */ static int acpi_reset_clock = 1; TUNABLE_INT("debug.acpi.reset_clock", &acpi_reset_clock); SYSCTL_INT(_debug_acpi, OID_AUTO, reset_clock, CTLFLAG_RW, &acpi_reset_clock, 1, "Reset system clock while resuming."); #endif /* Allow users to override quirks. */ TUNABLE_INT("debug.acpi.quirks", &acpi_quirks); static int acpi_susp_bounce; SYSCTL_INT(_debug_acpi, OID_AUTO, suspend_bounce, CTLFLAG_RW, &acpi_susp_bounce, 0, "Don't actually suspend, just test devices."); /* * ACPI can only be loaded as a module by the loader; activating it after * system bootstrap time is not useful, and can be fatal to the system. * It also cannot be unloaded, since the entire system bus hierarchy hangs * off it. */ static int acpi_modevent(struct module *mod, int event, void *junk) { switch (event) { case MOD_LOAD: if (!cold) { printf("The ACPI driver cannot be loaded after boot.\n"); return (EPERM); } break; case MOD_UNLOAD: if (!cold && power_pm_get_type() == POWER_PM_TYPE_ACPI) return (EBUSY); break; default: break; } return (0); } /* * Perform early initialization. */ ACPI_STATUS acpi_Startup(void) { static int started = 0; ACPI_STATUS status; int val; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Only run the startup code once. The MADT driver also calls this. */ if (started) return_VALUE (AE_OK); started = 1; /* * Pre-allocate space for RSDT/XSDT and DSDT tables and allow resizing * if more tables exist. */ if (ACPI_FAILURE(status = AcpiInitializeTables(NULL, 2, TRUE))) { printf("ACPI: Table initialisation failed: %s\n", AcpiFormatException(status)); return_VALUE (status); } /* Set up any quirks we have for this system. */ if (acpi_quirks == ACPI_Q_OK) acpi_table_quirks(&acpi_quirks); /* If the user manually set the disabled hint to 0, force-enable ACPI. */ if (resource_int_value("acpi", 0, "disabled", &val) == 0 && val == 0) acpi_quirks &= ~ACPI_Q_BROKEN; if (acpi_quirks & ACPI_Q_BROKEN) { printf("ACPI disabled by blacklist. Contact your BIOS vendor.\n"); status = AE_SUPPORT; } return_VALUE (status); } /* * Detect ACPI and perform early initialisation. */ int acpi_identify(void) { ACPI_TABLE_RSDP *rsdp; ACPI_TABLE_HEADER *rsdt; ACPI_PHYSICAL_ADDRESS paddr; struct sbuf sb; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (!cold) return (ENXIO); /* Check that we haven't been disabled with a hint. */ if (resource_disabled("acpi", 0)) return (ENXIO); /* Check for other PM systems. */ if (power_pm_get_type() != POWER_PM_TYPE_NONE && power_pm_get_type() != POWER_PM_TYPE_ACPI) { printf("ACPI identify failed, other PM system enabled.\n"); return (ENXIO); } /* Initialize root tables. */ if (ACPI_FAILURE(acpi_Startup())) { printf("ACPI: Try disabling either ACPI or apic support.\n"); return (ENXIO); } if ((paddr = AcpiOsGetRootPointer()) == 0 || (rsdp = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_RSDP))) == NULL) return (ENXIO); if (rsdp->Revision > 1 && rsdp->XsdtPhysicalAddress != 0) paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->XsdtPhysicalAddress; else paddr = (ACPI_PHYSICAL_ADDRESS)rsdp->RsdtPhysicalAddress; AcpiOsUnmapMemory(rsdp, sizeof(ACPI_TABLE_RSDP)); if ((rsdt = AcpiOsMapMemory(paddr, sizeof(ACPI_TABLE_HEADER))) == NULL) return (ENXIO); sbuf_new(&sb, acpi_desc, sizeof(acpi_desc), SBUF_FIXEDLEN); sbuf_bcat(&sb, rsdt->OemId, ACPI_OEM_ID_SIZE); sbuf_trim(&sb); sbuf_putc(&sb, ' '); sbuf_bcat(&sb, rsdt->OemTableId, ACPI_OEM_TABLE_ID_SIZE); sbuf_trim(&sb); sbuf_finish(&sb); sbuf_delete(&sb); AcpiOsUnmapMemory(rsdt, sizeof(ACPI_TABLE_HEADER)); snprintf(acpi_ca_version, sizeof(acpi_ca_version), "%x", ACPI_CA_VERSION); return (0); } /* * Fetch some descriptive data from ACPI to put in our attach message. */ static int acpi_probe(device_t dev) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); device_set_desc(dev, acpi_desc); return_VALUE (BUS_PROBE_NOWILDCARD); } static int acpi_attach(device_t dev) { struct acpi_softc *sc; ACPI_STATUS status; int error, state; UINT32 flags; UINT8 TypeA, TypeB; char *env; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = device_get_softc(dev); sc->acpi_dev = dev; callout_init(&sc->susp_force_to, 1); error = ENXIO; /* Initialize resource manager. */ acpi_rman_io.rm_type = RMAN_ARRAY; acpi_rman_io.rm_start = 0; acpi_rman_io.rm_end = 0xffff; acpi_rman_io.rm_descr = "ACPI I/O ports"; if (rman_init(&acpi_rman_io) != 0) panic("acpi rman_init IO ports failed"); acpi_rman_mem.rm_type = RMAN_ARRAY; acpi_rman_mem.rm_descr = "ACPI I/O memory addresses"; if (rman_init(&acpi_rman_mem) != 0) panic("acpi rman_init memory failed"); /* Initialise the ACPI mutex */ mtx_init(&acpi_mutex, "ACPI global lock", NULL, MTX_DEF); /* * Set the globals from our tunables. This is needed because ACPI-CA * uses UINT8 for some values and we have no tunable_byte. */ AcpiGbl_EnableInterpreterSlack = acpi_interpreter_slack ? TRUE : FALSE; AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE; AcpiGbl_UseDefaultRegisterWidths = acpi_ignore_reg_width ? TRUE : FALSE; #ifndef ACPI_DEBUG /* * Disable all debugging layers and levels. */ AcpiDbgLayer = 0; AcpiDbgLevel = 0; #endif /* Start up the ACPI CA subsystem. */ status = AcpiInitializeSubsystem(); if (ACPI_FAILURE(status)) { device_printf(dev, "Could not initialize Subsystem: %s\n", AcpiFormatException(status)); goto out; } /* Override OS interfaces if the user requested. */ acpi_reset_interfaces(dev); /* Load ACPI name space. */ status = AcpiLoadTables(); if (ACPI_FAILURE(status)) { device_printf(dev, "Could not load Namespace: %s\n", AcpiFormatException(status)); goto out; } #if defined(__i386__) || defined(__amd64__) /* Handle MCFG table if present. */ acpi_enable_pcie(); #endif /* * Note that some systems (specifically, those with namespace evaluation * issues that require the avoidance of parts of the namespace) must * avoid running _INI and _STA on everything, as well as dodging the final * object init pass. * * For these devices, we set ACPI_NO_DEVICE_INIT and ACPI_NO_OBJECT_INIT). * * XXX We should arrange for the object init pass after we have attached * all our child devices, but on many systems it works here. */ flags = 0; if (testenv("debug.acpi.avoid")) flags = ACPI_NO_DEVICE_INIT | ACPI_NO_OBJECT_INIT; /* Bring the hardware and basic handlers online. */ if (ACPI_FAILURE(status = AcpiEnableSubsystem(flags))) { device_printf(dev, "Could not enable ACPI: %s\n", AcpiFormatException(status)); goto out; } /* * Call the ECDT probe function to provide EC functionality before * the namespace has been evaluated. * * XXX This happens before the sysresource devices have been probed and * attached so its resources come from nexus0. In practice, this isn't * a problem but should be addressed eventually. */ acpi_ec_ecdt_probe(dev); /* Bring device objects and regions online. */ if (ACPI_FAILURE(status = AcpiInitializeObjects(flags))) { device_printf(dev, "Could not initialize ACPI objects: %s\n", AcpiFormatException(status)); goto out; } /* * Setup our sysctl tree. * * XXX: This doesn't check to make sure that none of these fail. */ sysctl_ctx_init(&sc->acpi_sysctl_ctx); sc->acpi_sysctl_tree = SYSCTL_ADD_NODE(&sc->acpi_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, device_get_name(dev), CTLFLAG_RD, 0, ""); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "supported_sleep_state", CTLTYPE_STRING | CTLFLAG_RD, 0, 0, acpi_supported_sleep_state_sysctl, "A", "List supported ACPI sleep states."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "power_button_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_power_button_sx, 0, acpi_sleep_state_sysctl, "A", "Power button ACPI sleep state."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "sleep_button_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_sleep_button_sx, 0, acpi_sleep_state_sysctl, "A", "Sleep button ACPI sleep state."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "lid_switch_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_lid_switch_sx, 0, acpi_sleep_state_sysctl, "A", "Lid ACPI sleep state. Set to S3 if you want to suspend your laptop when close the Lid."); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "standby_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_standby_sx, 0, acpi_sleep_state_sysctl, "A", ""); SYSCTL_ADD_PROC(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "suspend_state", CTLTYPE_STRING | CTLFLAG_RW, &sc->acpi_suspend_sx, 0, acpi_sleep_state_sysctl, "A", ""); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "sleep_delay", CTLFLAG_RW, &sc->acpi_sleep_delay, 0, "sleep delay in seconds"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "s4bios", CTLFLAG_RW, &sc->acpi_s4bios, 0, "S4BIOS mode"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "verbose", CTLFLAG_RW, &sc->acpi_verbose, 0, "verbose mode"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "disable_on_reboot", CTLFLAG_RW, &sc->acpi_do_disable, 0, "Disable ACPI when rebooting/halting system"); SYSCTL_ADD_INT(&sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(sc->acpi_sysctl_tree), OID_AUTO, "handle_reboot", CTLFLAG_RW, &sc->acpi_handle_reboot, 0, "Use ACPI Reset Register to reboot"); /* * Default to 1 second before sleeping to give some machines time to * stabilize. */ sc->acpi_sleep_delay = 1; if (bootverbose) sc->acpi_verbose = 1; if ((env = kern_getenv("hw.acpi.verbose")) != NULL) { if (strcmp(env, "0") != 0) sc->acpi_verbose = 1; freeenv(env); } /* Only enable reboot by default if the FADT says it is available. */ if (AcpiGbl_FADT.Flags & ACPI_FADT_RESET_REGISTER) sc->acpi_handle_reboot = 1; #if !ACPI_REDUCED_HARDWARE /* Only enable S4BIOS by default if the FACS says it is available. */ if (AcpiGbl_FACS != NULL && AcpiGbl_FACS->Flags & ACPI_FACS_S4_BIOS_PRESENT) sc->acpi_s4bios = 1; #endif /* Probe all supported sleep states. */ acpi_sleep_states[ACPI_STATE_S0] = TRUE; for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++) if (ACPI_SUCCESS(AcpiEvaluateObject(ACPI_ROOT_OBJECT, __DECONST(char *, AcpiGbl_SleepStateNames[state]), NULL, NULL)) && ACPI_SUCCESS(AcpiGetSleepTypeData(state, &TypeA, &TypeB))) acpi_sleep_states[state] = TRUE; /* * Dispatch the default sleep state to devices. The lid switch is set * to UNKNOWN by default to avoid surprising users. */ sc->acpi_power_button_sx = acpi_sleep_states[ACPI_STATE_S5] ? ACPI_STATE_S5 : ACPI_STATE_UNKNOWN; sc->acpi_lid_switch_sx = ACPI_STATE_UNKNOWN; sc->acpi_standby_sx = acpi_sleep_states[ACPI_STATE_S1] ? ACPI_STATE_S1 : ACPI_STATE_UNKNOWN; sc->acpi_suspend_sx = acpi_sleep_states[ACPI_STATE_S3] ? ACPI_STATE_S3 : ACPI_STATE_UNKNOWN; /* Pick the first valid sleep state for the sleep button default. */ sc->acpi_sleep_button_sx = ACPI_STATE_UNKNOWN; for (state = ACPI_STATE_S1; state <= ACPI_STATE_S4; state++) if (acpi_sleep_states[state]) { sc->acpi_sleep_button_sx = state; break; } acpi_enable_fixed_events(sc); /* * Scan the namespace and attach/initialise children. */ /* Register our shutdown handler. */ EVENTHANDLER_REGISTER(shutdown_final, acpi_shutdown_final, sc, SHUTDOWN_PRI_LAST); /* * Register our acpi event handlers. * XXX should be configurable eg. via userland policy manager. */ EVENTHANDLER_REGISTER(acpi_sleep_event, acpi_system_eventhandler_sleep, sc, ACPI_EVENT_PRI_LAST); EVENTHANDLER_REGISTER(acpi_wakeup_event, acpi_system_eventhandler_wakeup, sc, ACPI_EVENT_PRI_LAST); /* Flag our initial states. */ sc->acpi_enabled = TRUE; sc->acpi_sstate = ACPI_STATE_S0; sc->acpi_sleep_disabled = TRUE; /* Create the control device */ sc->acpi_dev_t = make_dev(&acpi_cdevsw, 0, UID_ROOT, GID_WHEEL, 0644, "acpi"); sc->acpi_dev_t->si_drv1 = sc; if ((error = acpi_machdep_init(dev))) goto out; /* Register ACPI again to pass the correct argument of pm_func. */ power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, sc); if (!acpi_disabled("bus")) { EVENTHANDLER_REGISTER(dev_lookup, acpi_lookup, NULL, 1000); acpi_probe_children(dev); } /* Update all GPEs and enable runtime GPEs. */ status = AcpiUpdateAllGpes(); if (ACPI_FAILURE(status)) device_printf(dev, "Could not update all GPEs: %s\n", AcpiFormatException(status)); /* Allow sleep request after a while. */ callout_init_mtx(&acpi_sleep_timer, &acpi_mutex, 0); callout_reset(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME, acpi_sleep_enable, sc); error = 0; out: return_VALUE (error); } static void acpi_set_power_children(device_t dev, int state) { device_t child; device_t *devlist; int dstate, i, numdevs; if (device_get_children(dev, &devlist, &numdevs) != 0) return; /* * Retrieve and set D-state for the sleep state if _SxD is present. * Skip children who aren't attached since they are handled separately. */ for (i = 0; i < numdevs; i++) { child = devlist[i]; dstate = state; if (device_is_attached(child) && acpi_device_pwr_for_sleep(dev, child, &dstate) == 0) acpi_set_powerstate(child, dstate); } free(devlist, M_TEMP); } static int acpi_suspend(device_t dev) { int error; GIANT_REQUIRED; error = bus_generic_suspend(dev); if (error == 0) acpi_set_power_children(dev, ACPI_STATE_D3); return (error); } static int acpi_resume(device_t dev) { GIANT_REQUIRED; acpi_set_power_children(dev, ACPI_STATE_D0); return (bus_generic_resume(dev)); } static int acpi_shutdown(device_t dev) { GIANT_REQUIRED; /* Allow children to shutdown first. */ bus_generic_shutdown(dev); /* * Enable any GPEs that are able to power-on the system (i.e., RTC). * Also, disable any that are not valid for this state (most). */ acpi_wake_prep_walk(ACPI_STATE_S5); return (0); } /* * Handle a new device being added */ static device_t acpi_add_child(device_t bus, u_int order, const char *name, int unit) { struct acpi_device *ad; device_t child; if ((ad = malloc(sizeof(*ad), M_ACPIDEV, M_NOWAIT | M_ZERO)) == NULL) return (NULL); resource_list_init(&ad->ad_rl); child = device_add_child_ordered(bus, order, name, unit); if (child != NULL) device_set_ivars(child, ad); else free(ad, M_ACPIDEV); return (child); } static int acpi_print_child(device_t bus, device_t child) { struct acpi_device *adev = device_get_ivars(child); struct resource_list *rl = &adev->ad_rl; int retval = 0; retval += bus_print_child_header(bus, child); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); retval += resource_list_print_type(rl, "drq", SYS_RES_DRQ, "%jd"); if (device_get_flags(child)) retval += printf(" flags %#x", device_get_flags(child)); retval += bus_print_child_domain(bus, child); retval += bus_print_child_footer(bus, child); return (retval); } /* * If this device is an ACPI child but no one claimed it, attempt * to power it off. We'll power it back up when a driver is added. * * XXX Disabled for now since many necessary devices (like fdc and * ATA) don't claim the devices we created for them but still expect * them to be powered up. */ static void acpi_probe_nomatch(device_t bus, device_t child) { #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER acpi_set_powerstate(child, ACPI_STATE_D3); #endif } /* * If a new driver has a chance to probe a child, first power it up. * * XXX Disabled for now (see acpi_probe_nomatch for details). */ static void acpi_driver_added(device_t dev, driver_t *driver) { device_t child, *devlist; int i, numdevs; DEVICE_IDENTIFY(driver, dev); if (device_get_children(dev, &devlist, &numdevs)) return; for (i = 0; i < numdevs; i++) { child = devlist[i]; if (device_get_state(child) == DS_NOTPRESENT) { #ifdef ACPI_ENABLE_POWERDOWN_NODRIVER acpi_set_powerstate(child, ACPI_STATE_D0); if (device_probe_and_attach(child) != 0) acpi_set_powerstate(child, ACPI_STATE_D3); #else device_probe_and_attach(child); #endif } } free(devlist, M_TEMP); } /* Location hint for devctl(8) */ static int acpi_child_location_str_method(device_t cbdev, device_t child, char *buf, size_t buflen) { struct acpi_device *dinfo = device_get_ivars(child); char buf2[32]; int pxm; if (dinfo->ad_handle) { snprintf(buf, buflen, "handle=%s", acpi_name(dinfo->ad_handle)); if (ACPI_SUCCESS(acpi_GetInteger(dinfo->ad_handle, "_PXM", &pxm))) { snprintf(buf2, 32, " _PXM=%d", pxm); strlcat(buf, buf2, buflen); } } else { snprintf(buf, buflen, "unknown"); } return (0); } /* PnP information for devctl(8) */ static int acpi_child_pnpinfo_str_method(device_t cbdev, device_t child, char *buf, size_t buflen) { struct acpi_device *dinfo = device_get_ivars(child); ACPI_DEVICE_INFO *adinfo; if (ACPI_FAILURE(AcpiGetObjectInfo(dinfo->ad_handle, &adinfo))) { snprintf(buf, buflen, "unknown"); return (0); } snprintf(buf, buflen, "_HID=%s _UID=%lu", (adinfo->Valid & ACPI_VALID_HID) ? adinfo->HardwareId.String : "none", (adinfo->Valid & ACPI_VALID_UID) ? strtoul(adinfo->UniqueId.String, NULL, 10) : 0UL); AcpiOsFree(adinfo); return (0); } /* * Handle per-device ivars */ static int acpi_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct acpi_device *ad; if ((ad = device_get_ivars(child)) == NULL) { device_printf(child, "device has no ivars\n"); return (ENOENT); } /* ACPI and ISA compatibility ivars */ switch(index) { case ACPI_IVAR_HANDLE: *(ACPI_HANDLE *)result = ad->ad_handle; break; case ACPI_IVAR_PRIVATE: *(void **)result = ad->ad_private; break; case ACPI_IVAR_FLAGS: *(int *)result = ad->ad_flags; break; case ISA_IVAR_VENDORID: case ISA_IVAR_SERIAL: case ISA_IVAR_COMPATID: *(int *)result = -1; break; case ISA_IVAR_LOGICALID: *(int *)result = acpi_isa_get_logicalid(child); break; default: return (ENOENT); } return (0); } static int acpi_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct acpi_device *ad; if ((ad = device_get_ivars(child)) == NULL) { device_printf(child, "device has no ivars\n"); return (ENOENT); } switch(index) { case ACPI_IVAR_HANDLE: ad->ad_handle = (ACPI_HANDLE)value; break; case ACPI_IVAR_PRIVATE: ad->ad_private = (void *)value; break; case ACPI_IVAR_FLAGS: ad->ad_flags = (int)value; break; default: panic("bad ivar write request (%d)", index); return (ENOENT); } return (0); } /* * Handle child resource allocation/removal */ static struct resource_list * acpi_get_rlist(device_t dev, device_t child) { struct acpi_device *ad; ad = device_get_ivars(child); return (&ad->ad_rl); } static int acpi_match_resource_hint(device_t dev, int type, long value) { struct acpi_device *ad = device_get_ivars(dev); struct resource_list *rl = &ad->ad_rl; struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->start <= value && rle->end >= value) return (1); } return (0); } /* * Wire device unit numbers based on resource matches in hints. */ static void acpi_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp) { const char *s; long value; int line, matches, unit; /* * Iterate over all the hints for the devices with the specified * name to see if one's resources are a subset of this device. */ line = 0; for (;;) { if (resource_find_dev(&line, name, &unit, "at", NULL) != 0) break; /* Must have an "at" for acpi or isa. */ resource_string_value(name, unit, "at", &s); if (!(strcmp(s, "acpi0") == 0 || strcmp(s, "acpi") == 0 || strcmp(s, "isa0") == 0 || strcmp(s, "isa") == 0)) continue; /* * Check for matching resources. We must have at least one match. * Since I/O and memory resources cannot be shared, if we get a * match on either of those, ignore any mismatches in IRQs or DRQs. * * XXX: We may want to revisit this to be more lenient and wire * as long as it gets one match. */ matches = 0; if (resource_long_value(name, unit, "port", &value) == 0) { /* * Floppy drive controllers are notorious for having a * wide variety of resources not all of which include the * first port that is specified by the hint (typically * 0x3f0) (see the comment above fdc_isa_alloc_resources() * in fdc_isa.c). However, they do all seem to include * port + 2 (e.g. 0x3f2) so for a floppy device, look for * 'value + 2' in the port resources instead of the hint * value. */ if (strcmp(name, "fdc") == 0) value += 2; if (acpi_match_resource_hint(child, SYS_RES_IOPORT, value)) matches++; else continue; } if (resource_long_value(name, unit, "maddr", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_MEMORY, value)) matches++; else continue; } if (matches > 0) goto matched; if (resource_long_value(name, unit, "irq", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_IRQ, value)) matches++; else continue; } if (resource_long_value(name, unit, "drq", &value) == 0) { if (acpi_match_resource_hint(child, SYS_RES_DRQ, value)) matches++; else continue; } matched: if (matches > 0) { /* We have a winner! */ *unitp = unit; break; } } } /* * Fetch the VM domain for the given device 'dev'. * * Return 1 + domain if there's a domain, 0 if not found; * -1 upon an error. */ int acpi_parse_pxm(device_t dev, int *domain) { -#if MAXMEMDOM > 1 +#ifdef DEVICE_NUMA ACPI_HANDLE h; int d, pxm; h = acpi_get_handle(dev); if ((h != NULL) && ACPI_SUCCESS(acpi_GetInteger(h, "_PXM", &pxm))) { d = acpi_map_pxm_to_vm_domainid(pxm); if (d < 0) return (-1); *domain = d; return (1); } #endif return (0); } /* * Fetch the NUMA domain for the given device. * * If a device has a _PXM method, map that to a NUMA domain. * * If none is found, then it'll call the parent method. * If there's no domain, return ENOENT. */ int acpi_get_domain(device_t dev, device_t child, int *domain) { int ret; ret = acpi_parse_pxm(child, domain); /* Error */ if (ret == -1) return (ENOENT); /* Found */ if (ret == 1) return (0); /* No _PXM node; go up a level */ return (bus_generic_get_domain(dev, child, domain)); } /* * Pre-allocate/manage all memory and IO resources. Since rman can't handle * duplicates, we merge any in the sysresource attach routine. */ static int acpi_sysres_alloc(device_t dev) { struct resource *res; struct resource_list *rl; struct resource_list_entry *rle; struct rman *rm; char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL }; device_t *children; int child_count, i; /* * Probe/attach any sysresource devices. This would be unnecessary if we * had multi-pass probe/attach. */ if (device_get_children(dev, &children, &child_count) != 0) return (ENXIO); for (i = 0; i < child_count; i++) { if (ACPI_ID_PROBE(dev, children[i], sysres_ids) != NULL) device_probe_and_attach(children[i]); } free(children, M_TEMP); rl = BUS_GET_RESOURCE_LIST(device_get_parent(dev), dev); STAILQ_FOREACH(rle, rl, link) { if (rle->res != NULL) { device_printf(dev, "duplicate resource for %jx\n", rle->start); continue; } /* Only memory and IO resources are valid here. */ switch (rle->type) { case SYS_RES_IOPORT: rm = &acpi_rman_io; break; case SYS_RES_MEMORY: rm = &acpi_rman_mem; break; default: continue; } /* Pre-allocate resource and add to our rman pool. */ res = BUS_ALLOC_RESOURCE(device_get_parent(dev), dev, rle->type, &rle->rid, rle->start, rle->start + rle->count - 1, rle->count, 0); if (res != NULL) { rman_manage_region(rm, rman_get_start(res), rman_get_end(res)); rle->res = res; } else if (bootverbose) device_printf(dev, "reservation of %jx, %jx (%d) failed\n", rle->start, rle->count, rle->type); } return (0); } static char *pcilink_ids[] = { "PNP0C0F", NULL }; static char *sysres_ids[] = { "PNP0C01", "PNP0C02", NULL }; /* * Reserve declared resources for devices found during attach once system * resources have been allocated. */ static void acpi_reserve_resources(device_t dev) { struct resource_list_entry *rle; struct resource_list *rl; struct acpi_device *ad; struct acpi_softc *sc; device_t *children; int child_count, i; sc = device_get_softc(dev); if (device_get_children(dev, &children, &child_count) != 0) return; for (i = 0; i < child_count; i++) { ad = device_get_ivars(children[i]); rl = &ad->ad_rl; /* Don't reserve system resources. */ if (ACPI_ID_PROBE(dev, children[i], sysres_ids) != NULL) continue; STAILQ_FOREACH(rle, rl, link) { /* * Don't reserve IRQ resources. There are many sticky things * to get right otherwise (e.g. IRQs for psm, atkbd, and HPET * when using legacy routing). */ if (rle->type == SYS_RES_IRQ) continue; /* * Don't reserve the resource if it is already allocated. * The acpi_ec(4) driver can allocate its resources early * if ECDT is present. */ if (rle->res != NULL) continue; /* * Try to reserve the resource from our parent. If this * fails because the resource is a system resource, just * let it be. The resource range is already reserved so * that other devices will not use it. If the driver * needs to allocate the resource, then * acpi_alloc_resource() will sub-alloc from the system * resource. */ resource_list_reserve(rl, dev, children[i], rle->type, &rle->rid, rle->start, rle->end, rle->count, 0); } } free(children, M_TEMP); sc->acpi_resources_reserved = 1; } static int acpi_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct acpi_softc *sc = device_get_softc(dev); struct acpi_device *ad = device_get_ivars(child); struct resource_list *rl = &ad->ad_rl; ACPI_DEVICE_INFO *devinfo; rman_res_t end; /* Ignore IRQ resources for PCI link devices. */ if (type == SYS_RES_IRQ && ACPI_ID_PROBE(dev, child, pcilink_ids) != NULL) return (0); /* * Ignore most resources for PCI root bridges. Some BIOSes * incorrectly enumerate the memory ranges they decode as plain * memory resources instead of as ResourceProducer ranges. Other * BIOSes incorrectly list system resource entries for I/O ranges * under the PCI bridge. Do allow the one known-correct case on * x86 of a PCI bridge claiming the I/O ports used for PCI config * access. */ if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) { if (ACPI_SUCCESS(AcpiGetObjectInfo(ad->ad_handle, &devinfo))) { if ((devinfo->Flags & ACPI_PCI_ROOT_BRIDGE) != 0) { #if defined(__i386__) || defined(__amd64__) if (!(type == SYS_RES_IOPORT && start == CONF1_ADDR_PORT)) #endif { AcpiOsFree(devinfo); return (0); } } AcpiOsFree(devinfo); } } /* If the resource is already allocated, fail. */ if (resource_list_busy(rl, type, rid)) return (EBUSY); /* If the resource is already reserved, release it. */ if (resource_list_reserved(rl, type, rid)) resource_list_unreserve(rl, dev, child, type, rid); /* Add the resource. */ end = (start + count - 1); resource_list_add(rl, type, rid, start, end, count); /* Don't reserve resources until the system resources are allocated. */ if (!sc->acpi_resources_reserved) return (0); /* Don't reserve system resources. */ if (ACPI_ID_PROBE(dev, child, sysres_ids) != NULL) return (0); /* * Don't reserve IRQ resources. There are many sticky things to * get right otherwise (e.g. IRQs for psm, atkbd, and HPET when * using legacy routing). */ if (type == SYS_RES_IRQ) return (0); /* * Reserve the resource. * * XXX: Ignores failure for now. Failure here is probably a * BIOS/firmware bug? */ resource_list_reserve(rl, dev, child, type, &rid, start, end, count, 0); return (0); } static struct resource * acpi_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { ACPI_RESOURCE ares; struct acpi_device *ad; struct resource_list_entry *rle; struct resource_list *rl; struct resource *res; int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); /* * First attempt at allocating the resource. For direct children, * use resource_list_alloc() to handle reserved resources. For * other devices, pass the request up to our parent. */ if (bus == device_get_parent(child)) { ad = device_get_ivars(child); rl = &ad->ad_rl; /* * Simulate the behavior of the ISA bus for direct children * devices. That is, if a non-default range is specified for * a resource that doesn't exist, use bus_set_resource() to * add the resource before allocating it. Note that these * resources will not be reserved. */ if (!isdefault && resource_list_find(rl, type, *rid) == NULL) resource_list_add(rl, type, *rid, start, end, count); res = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (res != NULL && type == SYS_RES_IRQ) { /* * Since bus_config_intr() takes immediate effect, we cannot * configure the interrupt associated with a device when we * parse the resources but have to defer it until a driver * actually allocates the interrupt via bus_alloc_resource(). * * XXX: Should we handle the lookup failing? */ if (ACPI_SUCCESS(acpi_lookup_irq_resource(child, *rid, res, &ares))) acpi_config_intr(child, &ares); } /* * If this is an allocation of the "default" range for a given * RID, fetch the exact bounds for this resource from the * resource list entry to try to allocate the range from the * system resource regions. */ if (res == NULL && isdefault) { rle = resource_list_find(rl, type, *rid); if (rle != NULL) { start = rle->start; end = rle->end; count = rle->count; } } } else res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * If the first attempt failed and this is an allocation of a * specific range, try to satisfy the request via a suballocation * from our system resource regions. */ if (res == NULL && start + count - 1 == end) res = acpi_alloc_sysres(child, type, rid, start, end, count, flags); return (res); } /* * Attempt to allocate a specific resource range from the system * resource ranges. Note that we only handle memory and I/O port * system resources. */ struct resource * acpi_alloc_sysres(device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct rman *rm; struct resource *res; switch (type) { case SYS_RES_IOPORT: rm = &acpi_rman_io; break; case SYS_RES_MEMORY: rm = &acpi_rman_mem; break; default: return (NULL); } KASSERT(start + count - 1 == end, ("wildcard resource range")); res = rman_reserve_resource(rm, start, end, count, flags & ~RF_ACTIVE, child); if (res == NULL) return (NULL); rman_set_rid(res, *rid); /* If requested, activate the resource using the parent's method. */ if (flags & RF_ACTIVE) if (bus_activate_resource(child, type, *rid, res) != 0) { rman_release_resource(res); return (NULL); } return (res); } static int acpi_is_resource_managed(int type, struct resource *r) { /* We only handle memory and IO resources through rman. */ switch (type) { case SYS_RES_IOPORT: return (rman_is_region_manager(r, &acpi_rman_io)); case SYS_RES_MEMORY: return (rman_is_region_manager(r, &acpi_rman_mem)); } return (0); } static int acpi_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (acpi_is_resource_managed(type, r)) return (rman_adjust_resource(r, start, end)); return (bus_generic_adjust_resource(bus, child, type, r, start, end)); } static int acpi_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { int ret; /* * If this resource belongs to one of our internal managers, * deactivate it and release it to the local pool. */ if (acpi_is_resource_managed(type, r)) { if (rman_get_flags(r) & RF_ACTIVE) { ret = bus_deactivate_resource(child, type, rid, r); if (ret != 0) return (ret); } return (rman_release_resource(r)); } return (bus_generic_rl_release_resource(bus, child, type, rid, r)); } static void acpi_delete_resource(device_t bus, device_t child, int type, int rid) { struct resource_list *rl; rl = acpi_get_rlist(bus, child); if (resource_list_busy(rl, type, rid)) { device_printf(bus, "delete_resource: Resource still owned by child" " (type=%d, rid=%d)\n", type, rid); return; } resource_list_unreserve(rl, bus, child, type, rid); resource_list_delete(rl, type, rid); } /* Allocate an IO port or memory resource, given its GAS. */ int acpi_bus_alloc_gas(device_t dev, int *type, int *rid, ACPI_GENERIC_ADDRESS *gas, struct resource **res, u_int flags) { int error, res_type; error = ENOMEM; if (type == NULL || rid == NULL || gas == NULL || res == NULL) return (EINVAL); /* We only support memory and IO spaces. */ switch (gas->SpaceId) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: res_type = SYS_RES_MEMORY; break; case ACPI_ADR_SPACE_SYSTEM_IO: res_type = SYS_RES_IOPORT; break; default: return (EOPNOTSUPP); } /* * If the register width is less than 8, assume the BIOS author means * it is a bit field and just allocate a byte. */ if (gas->BitWidth && gas->BitWidth < 8) gas->BitWidth = 8; /* Validate the address after we're sure we support the space. */ if (gas->Address == 0 || gas->BitWidth == 0) return (EINVAL); bus_set_resource(dev, res_type, *rid, gas->Address, gas->BitWidth / 8); *res = bus_alloc_resource_any(dev, res_type, rid, RF_ACTIVE | flags); if (*res != NULL) { *type = res_type; error = 0; } else bus_delete_resource(dev, res_type, *rid); return (error); } /* Probe _HID and _CID for compatible ISA PNP ids. */ static uint32_t acpi_isa_get_logicalid(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; uint32_t pnpid; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Fetch and validate the HID. */ if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return_VALUE (0); pnpid = (devinfo->Valid & ACPI_VALID_HID) != 0 && devinfo->HardwareId.Length >= ACPI_EISAID_STRING_SIZE ? PNP_EISAID(devinfo->HardwareId.String) : 0; AcpiOsFree(devinfo); return_VALUE (pnpid); } static int acpi_isa_get_compatid(device_t dev, uint32_t *cids, int count) { ACPI_DEVICE_INFO *devinfo; ACPI_PNP_DEVICE_ID *ids; ACPI_HANDLE h; uint32_t *pnpid; int i, valid; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); pnpid = cids; /* Fetch and validate the CID */ if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return_VALUE (0); if ((devinfo->Valid & ACPI_VALID_CID) == 0) { AcpiOsFree(devinfo); return_VALUE (0); } if (devinfo->CompatibleIdList.Count < count) count = devinfo->CompatibleIdList.Count; ids = devinfo->CompatibleIdList.Ids; for (i = 0, valid = 0; i < count; i++) if (ids[i].Length >= ACPI_EISAID_STRING_SIZE && strncmp(ids[i].String, "PNP", 3) == 0) { *pnpid++ = PNP_EISAID(ids[i].String); valid++; } AcpiOsFree(devinfo); return_VALUE (valid); } static char * acpi_device_id_probe(device_t bus, device_t dev, char **ids) { ACPI_HANDLE h; ACPI_OBJECT_TYPE t; int i; h = acpi_get_handle(dev); if (ids == NULL || h == NULL) return (NULL); t = acpi_get_type(dev); if (t != ACPI_TYPE_DEVICE && t != ACPI_TYPE_PROCESSOR) return (NULL); /* Try to match one of the array of IDs with a HID or CID. */ for (i = 0; ids[i] != NULL; i++) { if (acpi_MatchHid(h, ids[i])) return (ids[i]); } return (NULL); } static ACPI_STATUS acpi_device_eval_obj(device_t bus, device_t dev, ACPI_STRING pathname, ACPI_OBJECT_LIST *parameters, ACPI_BUFFER *ret) { ACPI_HANDLE h; if (dev == NULL) h = ACPI_ROOT_OBJECT; else if ((h = acpi_get_handle(dev)) == NULL) return (AE_BAD_PARAMETER); return (AcpiEvaluateObject(h, pathname, parameters, ret)); } int acpi_device_pwr_for_sleep(device_t bus, device_t dev, int *dstate) { struct acpi_softc *sc; ACPI_HANDLE handle; ACPI_STATUS status; char sxd[8]; handle = acpi_get_handle(dev); /* * XXX If we find these devices, don't try to power them down. * The serial and IRDA ports on my T23 hang the system when * set to D3 and it appears that such legacy devices may * need special handling in their drivers. */ if (dstate == NULL || handle == NULL || acpi_MatchHid(handle, "PNP0500") || acpi_MatchHid(handle, "PNP0501") || acpi_MatchHid(handle, "PNP0502") || acpi_MatchHid(handle, "PNP0510") || acpi_MatchHid(handle, "PNP0511")) return (ENXIO); /* * Override next state with the value from _SxD, if present. * Note illegal _S0D is evaluated because some systems expect this. */ sc = device_get_softc(bus); snprintf(sxd, sizeof(sxd), "_S%dD", sc->acpi_sstate); status = acpi_GetInteger(handle, sxd, dstate); if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) { device_printf(dev, "failed to get %s on %s: %s\n", sxd, acpi_name(handle), AcpiFormatException(status)); return (ENXIO); } return (0); } /* Callback arg for our implementation of walking the namespace. */ struct acpi_device_scan_ctx { acpi_scan_cb_t user_fn; void *arg; ACPI_HANDLE parent; }; static ACPI_STATUS acpi_device_scan_cb(ACPI_HANDLE h, UINT32 level, void *arg, void **retval) { struct acpi_device_scan_ctx *ctx; device_t dev, old_dev; ACPI_STATUS status; ACPI_OBJECT_TYPE type; /* * Skip this device if we think we'll have trouble with it or it is * the parent where the scan began. */ ctx = (struct acpi_device_scan_ctx *)arg; if (acpi_avoid(h) || h == ctx->parent) return (AE_OK); /* If this is not a valid device type (e.g., a method), skip it. */ if (ACPI_FAILURE(AcpiGetType(h, &type))) return (AE_OK); if (type != ACPI_TYPE_DEVICE && type != ACPI_TYPE_PROCESSOR && type != ACPI_TYPE_THERMAL && type != ACPI_TYPE_POWER) return (AE_OK); /* * Call the user function with the current device. If it is unchanged * afterwards, return. Otherwise, we update the handle to the new dev. */ old_dev = acpi_get_device(h); dev = old_dev; status = ctx->user_fn(h, &dev, level, ctx->arg); if (ACPI_FAILURE(status) || old_dev == dev) return (status); /* Remove the old child and its connection to the handle. */ if (old_dev != NULL) { device_delete_child(device_get_parent(old_dev), old_dev); AcpiDetachData(h, acpi_fake_objhandler); } /* Recreate the handle association if the user created a device. */ if (dev != NULL) AcpiAttachData(h, acpi_fake_objhandler, dev); return (AE_OK); } static ACPI_STATUS acpi_device_scan_children(device_t bus, device_t dev, int max_depth, acpi_scan_cb_t user_fn, void *arg) { ACPI_HANDLE h; struct acpi_device_scan_ctx ctx; if (acpi_disabled("children")) return (AE_OK); if (dev == NULL) h = ACPI_ROOT_OBJECT; else if ((h = acpi_get_handle(dev)) == NULL) return (AE_BAD_PARAMETER); ctx.user_fn = user_fn; ctx.arg = arg; ctx.parent = h; return (AcpiWalkNamespace(ACPI_TYPE_ANY, h, max_depth, acpi_device_scan_cb, NULL, &ctx, NULL)); } /* * Even though ACPI devices are not PCI, we use the PCI approach for setting * device power states since it's close enough to ACPI. */ static int acpi_set_powerstate(device_t child, int state) { ACPI_HANDLE h; ACPI_STATUS status; h = acpi_get_handle(child); if (state < ACPI_STATE_D0 || state > ACPI_D_STATES_MAX) return (EINVAL); if (h == NULL) return (0); /* Ignore errors if the power methods aren't present. */ status = acpi_pwr_switch_consumer(h, state); if (ACPI_SUCCESS(status)) { if (bootverbose) device_printf(child, "set ACPI power state D%d on %s\n", state, acpi_name(h)); } else if (status != AE_NOT_FOUND) device_printf(child, "failed to set ACPI power state D%d on %s: %s\n", state, acpi_name(h), AcpiFormatException(status)); return (0); } static int acpi_isa_pnp_probe(device_t bus, device_t child, struct isa_pnp_id *ids) { int result, cid_count, i; uint32_t lid, cids[8]; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * ISA-style drivers attached to ACPI may persist and * probe manually if we return ENOENT. We never want * that to happen, so don't ever return it. */ result = ENXIO; /* Scan the supplied IDs for a match */ lid = acpi_isa_get_logicalid(child); cid_count = acpi_isa_get_compatid(child, cids, 8); while (ids && ids->ip_id) { if (lid == ids->ip_id) { result = 0; goto out; } for (i = 0; i < cid_count; i++) { if (cids[i] == ids->ip_id) { result = 0; goto out; } } ids++; } out: if (result == 0 && ids->ip_desc) device_set_desc(child, ids->ip_desc); return_VALUE (result); } #if defined(__i386__) || defined(__amd64__) /* * Look for a MCFG table. If it is present, use the settings for * domain (segment) 0 to setup PCI config space access via the memory * map. */ static void acpi_enable_pcie(void) { ACPI_TABLE_HEADER *hdr; ACPI_MCFG_ALLOCATION *alloc, *end; ACPI_STATUS status; status = AcpiGetTable(ACPI_SIG_MCFG, 1, &hdr); if (ACPI_FAILURE(status)) return; end = (ACPI_MCFG_ALLOCATION *)((char *)hdr + hdr->Length); alloc = (ACPI_MCFG_ALLOCATION *)((ACPI_TABLE_MCFG *)hdr + 1); while (alloc < end) { if (alloc->PciSegment == 0) { pcie_cfgregopen(alloc->Address, alloc->StartBusNumber, alloc->EndBusNumber); return; } alloc++; } } #endif /* * Scan all of the ACPI namespace and attach child devices. * * We should only expect to find devices in the \_PR, \_TZ, \_SI, and * \_SB scopes, and \_PR and \_TZ became obsolete in the ACPI 2.0 spec. * However, in violation of the spec, some systems place their PCI link * devices in \, so we have to walk the whole namespace. We check the * type of namespace nodes, so this should be ok. */ static void acpi_probe_children(device_t bus) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * Scan the namespace and insert placeholders for all the devices that * we find. We also probe/attach any early devices. * * Note that we use AcpiWalkNamespace rather than AcpiGetDevices because * we want to create nodes for all devices, not just those that are * currently present. (This assumes that we don't want to create/remove * devices as they appear, which might be smarter.) */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "namespace scan\n")); AcpiWalkNamespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, 100, acpi_probe_child, NULL, bus, NULL); /* Pre-allocate resources for our rman from any sysresource devices. */ acpi_sysres_alloc(bus); /* Reserve resources already allocated to children. */ acpi_reserve_resources(bus); /* Create any static children by calling device identify methods. */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "device identify routines\n")); bus_generic_probe(bus); /* Probe/attach all children, created statically and from the namespace. */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "acpi bus_generic_attach\n")); bus_generic_attach(bus); /* Attach wake sysctls. */ acpi_wake_sysctl_walk(bus); ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "done attaching children\n")); return_VOID; } /* * Determine the probe order for a given device. */ static void acpi_probe_order(ACPI_HANDLE handle, int *order) { ACPI_OBJECT_TYPE type; /* * 0. CPUs * 1. I/O port and memory system resource holders * 2. Clocks and timers (to handle early accesses) * 3. Embedded controllers (to handle early accesses) * 4. PCI Link Devices */ AcpiGetType(handle, &type); if (type == ACPI_TYPE_PROCESSOR) *order = 0; else if (acpi_MatchHid(handle, "PNP0C01") || acpi_MatchHid(handle, "PNP0C02")) *order = 1; else if (acpi_MatchHid(handle, "PNP0100") || acpi_MatchHid(handle, "PNP0103") || acpi_MatchHid(handle, "PNP0B00")) *order = 2; else if (acpi_MatchHid(handle, "PNP0C09")) *order = 3; else if (acpi_MatchHid(handle, "PNP0C0F")) *order = 4; } /* * Evaluate a child device and determine whether we might attach a device to * it. */ static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level, void *context, void **status) { struct acpi_prw_data prw; ACPI_OBJECT_TYPE type; ACPI_HANDLE h; device_t bus, child; char *handle_str; int order; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (acpi_disabled("children")) return_ACPI_STATUS (AE_OK); /* Skip this device if we think we'll have trouble with it. */ if (acpi_avoid(handle)) return_ACPI_STATUS (AE_OK); bus = (device_t)context; if (ACPI_SUCCESS(AcpiGetType(handle, &type))) { handle_str = acpi_name(handle); switch (type) { case ACPI_TYPE_DEVICE: /* * Since we scan from \, be sure to skip system scope objects. * \_SB_ and \_TZ_ are defined in ACPICA as devices to work around * BIOS bugs. For example, \_SB_ is to allow \_SB_._INI to be run * during the intialization and \_TZ_ is to support Notify() on it. */ if (strcmp(handle_str, "\\_SB_") == 0 || strcmp(handle_str, "\\_TZ_") == 0) break; if (acpi_parse_prw(handle, &prw) == 0) AcpiSetupGpeForWake(handle, prw.gpe_handle, prw.gpe_bit); /* * Ignore devices that do not have a _HID or _CID. They should * be discovered by other buses (e.g. the PCI bus driver). */ if (!acpi_has_hid(handle)) break; /* FALLTHROUGH */ case ACPI_TYPE_PROCESSOR: case ACPI_TYPE_THERMAL: case ACPI_TYPE_POWER: /* * Create a placeholder device for this node. Sort the * placeholder so that the probe/attach passes will run * breadth-first. Orders less than ACPI_DEV_BASE_ORDER * are reserved for special objects (i.e., system * resources). */ ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "scanning '%s'\n", handle_str)); order = level * 10 + ACPI_DEV_BASE_ORDER; acpi_probe_order(handle, &order); child = BUS_ADD_CHILD(bus, order, NULL, -1); if (child == NULL) break; /* Associate the handle with the device_t and vice versa. */ acpi_set_handle(child, handle); AcpiAttachData(handle, acpi_fake_objhandler, child); /* * Check that the device is present. If it's not present, * leave it disabled (so that we have a device_t attached to * the handle, but we don't probe it). * * XXX PCI link devices sometimes report "present" but not * "functional" (i.e. if disabled). Go ahead and probe them * anyway since we may enable them later. */ if (type == ACPI_TYPE_DEVICE && !acpi_DeviceIsPresent(child)) { /* Never disable PCI link devices. */ if (acpi_MatchHid(handle, "PNP0C0F")) break; /* * Docking stations should remain enabled since the system * may be undocked at boot. */ if (ACPI_SUCCESS(AcpiGetHandle(handle, "_DCK", &h))) break; device_disable(child); break; } /* * Get the device's resource settings and attach them. * Note that if the device has _PRS but no _CRS, we need * to decide when it's appropriate to try to configure the * device. Ignore the return value here; it's OK for the * device not to have any resources. */ acpi_parse_resources(child, handle, &acpi_res_parse_set, NULL); break; } } return_ACPI_STATUS (AE_OK); } /* * AcpiAttachData() requires an object handler but never uses it. This is a * placeholder object handler so we can store a device_t in an ACPI_HANDLE. */ void acpi_fake_objhandler(ACPI_HANDLE h, void *data) { } static void acpi_shutdown_final(void *arg, int howto) { struct acpi_softc *sc = (struct acpi_softc *)arg; register_t intr; ACPI_STATUS status; /* * XXX Shutdown code should only run on the BSP (cpuid 0). * Some chipsets do not power off the system correctly if called from * an AP. */ if ((howto & RB_POWEROFF) != 0) { status = AcpiEnterSleepStatePrep(ACPI_STATE_S5); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n", AcpiFormatException(status)); return; } device_printf(sc->acpi_dev, "Powering system off\n"); intr = intr_disable(); status = AcpiEnterSleepState(ACPI_STATE_S5); if (ACPI_FAILURE(status)) { intr_restore(intr); device_printf(sc->acpi_dev, "power-off failed - %s\n", AcpiFormatException(status)); } else { DELAY(1000000); intr_restore(intr); device_printf(sc->acpi_dev, "power-off failed - timeout\n"); } } else if ((howto & RB_HALT) == 0 && sc->acpi_handle_reboot) { /* Reboot using the reset register. */ status = AcpiReset(); if (ACPI_SUCCESS(status)) { DELAY(1000000); device_printf(sc->acpi_dev, "reset failed - timeout\n"); } else if (status != AE_NOT_EXIST) device_printf(sc->acpi_dev, "reset failed - %s\n", AcpiFormatException(status)); } else if (sc->acpi_do_disable && panicstr == NULL) { /* * Only disable ACPI if the user requested. On some systems, writing * the disable value to SMI_CMD hangs the system. */ device_printf(sc->acpi_dev, "Shutting down\n"); AcpiTerminate(); } } static void acpi_enable_fixed_events(struct acpi_softc *sc) { static int first_time = 1; /* Enable and clear fixed events and install handlers. */ if ((AcpiGbl_FADT.Flags & ACPI_FADT_POWER_BUTTON) == 0) { AcpiClearEvent(ACPI_EVENT_POWER_BUTTON); AcpiInstallFixedEventHandler(ACPI_EVENT_POWER_BUTTON, acpi_event_power_button_sleep, sc); if (first_time) device_printf(sc->acpi_dev, "Power Button (fixed)\n"); } if ((AcpiGbl_FADT.Flags & ACPI_FADT_SLEEP_BUTTON) == 0) { AcpiClearEvent(ACPI_EVENT_SLEEP_BUTTON); AcpiInstallFixedEventHandler(ACPI_EVENT_SLEEP_BUTTON, acpi_event_sleep_button_sleep, sc); if (first_time) device_printf(sc->acpi_dev, "Sleep Button (fixed)\n"); } first_time = 0; } /* * Returns true if the device is actually present and should * be attached to. This requires the present, enabled, UI-visible * and diagnostics-passed bits to be set. */ BOOLEAN acpi_DeviceIsPresent(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; BOOLEAN present; if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); /* If no _STA method, must be present */ present = (devinfo->Valid & ACPI_VALID_STA) == 0 || ACPI_DEVICE_PRESENT(devinfo->CurrentStatus) ? TRUE : FALSE; AcpiOsFree(devinfo); return (present); } /* * Returns true if the battery is actually present and inserted. */ BOOLEAN acpi_BatteryIsPresent(device_t dev) { ACPI_DEVICE_INFO *devinfo; ACPI_HANDLE h; BOOLEAN present; if ((h = acpi_get_handle(dev)) == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); /* If no _STA method, must be present */ present = (devinfo->Valid & ACPI_VALID_STA) == 0 || ACPI_BATTERY_PRESENT(devinfo->CurrentStatus) ? TRUE : FALSE; AcpiOsFree(devinfo); return (present); } /* * Returns true if a device has at least one valid device ID. */ static BOOLEAN acpi_has_hid(ACPI_HANDLE h) { ACPI_DEVICE_INFO *devinfo; BOOLEAN ret; if (h == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); ret = FALSE; if ((devinfo->Valid & ACPI_VALID_HID) != 0) ret = TRUE; else if ((devinfo->Valid & ACPI_VALID_CID) != 0) if (devinfo->CompatibleIdList.Count > 0) ret = TRUE; AcpiOsFree(devinfo); return (ret); } /* * Match a HID string against a handle */ BOOLEAN acpi_MatchHid(ACPI_HANDLE h, const char *hid) { ACPI_DEVICE_INFO *devinfo; BOOLEAN ret; int i; if (hid == NULL || h == NULL || ACPI_FAILURE(AcpiGetObjectInfo(h, &devinfo))) return (FALSE); ret = FALSE; if ((devinfo->Valid & ACPI_VALID_HID) != 0 && strcmp(hid, devinfo->HardwareId.String) == 0) ret = TRUE; else if ((devinfo->Valid & ACPI_VALID_CID) != 0) for (i = 0; i < devinfo->CompatibleIdList.Count; i++) { if (strcmp(hid, devinfo->CompatibleIdList.Ids[i].String) == 0) { ret = TRUE; break; } } AcpiOsFree(devinfo); return (ret); } /* * Return the handle of a named object within our scope, ie. that of (parent) * or one if its parents. */ ACPI_STATUS acpi_GetHandleInScope(ACPI_HANDLE parent, char *path, ACPI_HANDLE *result) { ACPI_HANDLE r; ACPI_STATUS status; /* Walk back up the tree to the root */ for (;;) { status = AcpiGetHandle(parent, path, &r); if (ACPI_SUCCESS(status)) { *result = r; return (AE_OK); } /* XXX Return error here? */ if (status != AE_NOT_FOUND) return (AE_OK); if (ACPI_FAILURE(AcpiGetParent(parent, &r))) return (AE_NOT_FOUND); parent = r; } } /* * Allocate a buffer with a preset data size. */ ACPI_BUFFER * acpi_AllocBuffer(int size) { ACPI_BUFFER *buf; if ((buf = malloc(size + sizeof(*buf), M_ACPIDEV, M_NOWAIT)) == NULL) return (NULL); buf->Length = size; buf->Pointer = (void *)(buf + 1); return (buf); } ACPI_STATUS acpi_SetInteger(ACPI_HANDLE handle, char *path, UINT32 number) { ACPI_OBJECT arg1; ACPI_OBJECT_LIST args; arg1.Type = ACPI_TYPE_INTEGER; arg1.Integer.Value = number; args.Count = 1; args.Pointer = &arg1; return (AcpiEvaluateObject(handle, path, &args, NULL)); } /* * Evaluate a path that should return an integer. */ ACPI_STATUS acpi_GetInteger(ACPI_HANDLE handle, char *path, UINT32 *number) { ACPI_STATUS status; ACPI_BUFFER buf; ACPI_OBJECT param; if (handle == NULL) handle = ACPI_ROOT_OBJECT; /* * Assume that what we've been pointed at is an Integer object, or * a method that will return an Integer. */ buf.Pointer = ¶m; buf.Length = sizeof(param); status = AcpiEvaluateObject(handle, path, NULL, &buf); if (ACPI_SUCCESS(status)) { if (param.Type == ACPI_TYPE_INTEGER) *number = param.Integer.Value; else status = AE_TYPE; } /* * In some applications, a method that's expected to return an Integer * may instead return a Buffer (probably to simplify some internal * arithmetic). We'll try to fetch whatever it is, and if it's a Buffer, * convert it into an Integer as best we can. * * This is a hack. */ if (status == AE_BUFFER_OVERFLOW) { if ((buf.Pointer = AcpiOsAllocate(buf.Length)) == NULL) { status = AE_NO_MEMORY; } else { status = AcpiEvaluateObject(handle, path, NULL, &buf); if (ACPI_SUCCESS(status)) status = acpi_ConvertBufferToInteger(&buf, number); AcpiOsFree(buf.Pointer); } } return (status); } ACPI_STATUS acpi_ConvertBufferToInteger(ACPI_BUFFER *bufp, UINT32 *number) { ACPI_OBJECT *p; UINT8 *val; int i; p = (ACPI_OBJECT *)bufp->Pointer; if (p->Type == ACPI_TYPE_INTEGER) { *number = p->Integer.Value; return (AE_OK); } if (p->Type != ACPI_TYPE_BUFFER) return (AE_TYPE); if (p->Buffer.Length > sizeof(int)) return (AE_BAD_DATA); *number = 0; val = p->Buffer.Pointer; for (i = 0; i < p->Buffer.Length; i++) *number += val[i] << (i * 8); return (AE_OK); } /* * Iterate over the elements of an a package object, calling the supplied * function for each element. * * XXX possible enhancement might be to abort traversal on error. */ ACPI_STATUS acpi_ForeachPackageObject(ACPI_OBJECT *pkg, void (*func)(ACPI_OBJECT *comp, void *arg), void *arg) { ACPI_OBJECT *comp; int i; if (pkg == NULL || pkg->Type != ACPI_TYPE_PACKAGE) return (AE_BAD_PARAMETER); /* Iterate over components */ i = 0; comp = pkg->Package.Elements; for (; i < pkg->Package.Count; i++, comp++) func(comp, arg); return (AE_OK); } /* * Find the (index)th resource object in a set. */ ACPI_STATUS acpi_FindIndexedResource(ACPI_BUFFER *buf, int index, ACPI_RESOURCE **resp) { ACPI_RESOURCE *rp; int i; rp = (ACPI_RESOURCE *)buf->Pointer; i = index; while (i-- > 0) { /* Range check */ if (rp > (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length)) return (AE_BAD_PARAMETER); /* Check for terminator */ if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0) return (AE_NOT_FOUND); rp = ACPI_NEXT_RESOURCE(rp); } if (resp != NULL) *resp = rp; return (AE_OK); } /* * Append an ACPI_RESOURCE to an ACPI_BUFFER. * * Given a pointer to an ACPI_RESOURCE structure, expand the ACPI_BUFFER * provided to contain it. If the ACPI_BUFFER is empty, allocate a sensible * backing block. If the ACPI_RESOURCE is NULL, return an empty set of * resources. */ #define ACPI_INITIAL_RESOURCE_BUFFER_SIZE 512 ACPI_STATUS acpi_AppendBufferResource(ACPI_BUFFER *buf, ACPI_RESOURCE *res) { ACPI_RESOURCE *rp; void *newp; /* Initialise the buffer if necessary. */ if (buf->Pointer == NULL) { buf->Length = ACPI_INITIAL_RESOURCE_BUFFER_SIZE; if ((buf->Pointer = AcpiOsAllocate(buf->Length)) == NULL) return (AE_NO_MEMORY); rp = (ACPI_RESOURCE *)buf->Pointer; rp->Type = ACPI_RESOURCE_TYPE_END_TAG; rp->Length = ACPI_RS_SIZE_MIN; } if (res == NULL) return (AE_OK); /* * Scan the current buffer looking for the terminator. * This will either find the terminator or hit the end * of the buffer and return an error. */ rp = (ACPI_RESOURCE *)buf->Pointer; for (;;) { /* Range check, don't go outside the buffer */ if (rp >= (ACPI_RESOURCE *)((u_int8_t *)buf->Pointer + buf->Length)) return (AE_BAD_PARAMETER); if (rp->Type == ACPI_RESOURCE_TYPE_END_TAG || rp->Length == 0) break; rp = ACPI_NEXT_RESOURCE(rp); } /* * Check the size of the buffer and expand if required. * * Required size is: * size of existing resources before terminator + * size of new resource and header + * size of terminator. * * Note that this loop should really only run once, unless * for some reason we are stuffing a *really* huge resource. */ while ((((u_int8_t *)rp - (u_int8_t *)buf->Pointer) + res->Length + ACPI_RS_SIZE_NO_DATA + ACPI_RS_SIZE_MIN) >= buf->Length) { if ((newp = AcpiOsAllocate(buf->Length * 2)) == NULL) return (AE_NO_MEMORY); bcopy(buf->Pointer, newp, buf->Length); rp = (ACPI_RESOURCE *)((u_int8_t *)newp + ((u_int8_t *)rp - (u_int8_t *)buf->Pointer)); AcpiOsFree(buf->Pointer); buf->Pointer = newp; buf->Length += buf->Length; } /* Insert the new resource. */ bcopy(res, rp, res->Length + ACPI_RS_SIZE_NO_DATA); /* And add the terminator. */ rp = ACPI_NEXT_RESOURCE(rp); rp->Type = ACPI_RESOURCE_TYPE_END_TAG; rp->Length = ACPI_RS_SIZE_MIN; return (AE_OK); } /* * Set interrupt model. */ ACPI_STATUS acpi_SetIntrModel(int model) { return (acpi_SetInteger(ACPI_ROOT_OBJECT, "_PIC", model)); } /* * Walk subtables of a table and call a callback routine for each * subtable. The caller should provide the first subtable and a * pointer to the end of the table. This can be used to walk tables * such as MADT and SRAT that use subtable entries. */ void acpi_walk_subtables(void *first, void *end, acpi_subtable_handler *handler, void *arg) { ACPI_SUBTABLE_HEADER *entry; for (entry = first; (void *)entry < end; ) { /* Avoid an infinite loop if we hit a bogus entry. */ if (entry->Length < sizeof(ACPI_SUBTABLE_HEADER)) return; handler(entry, arg); entry = ACPI_ADD_PTR(ACPI_SUBTABLE_HEADER, entry, entry->Length); } } /* * DEPRECATED. This interface has serious deficiencies and will be * removed. * * Immediately enter the sleep state. In the old model, acpiconf(8) ran * rc.suspend and rc.resume so we don't have to notify devd(8) to do this. */ ACPI_STATUS acpi_SetSleepState(struct acpi_softc *sc, int state) { static int once; if (!once) { device_printf(sc->acpi_dev, "warning: acpi_SetSleepState() deprecated, need to update your software\n"); once = 1; } return (acpi_EnterSleepState(sc, state)); } #if defined(__amd64__) || defined(__i386__) static void acpi_sleep_force_task(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate))) device_printf(sc->acpi_dev, "force sleep state S%d failed\n", sc->acpi_next_sstate); } static void acpi_sleep_force(void *arg) { struct acpi_softc *sc = (struct acpi_softc *)arg; device_printf(sc->acpi_dev, "suspend request timed out, forcing sleep now\n"); /* * XXX Suspending from callout causes freezes in DEVICE_SUSPEND(). * Suspend from acpi_task thread instead. */ if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_sleep_force_task, sc))) device_printf(sc->acpi_dev, "AcpiOsExecute() for sleeping failed\n"); } #endif /* * Request that the system enter the given suspend state. All /dev/apm * devices and devd(8) will be notified. Userland then has a chance to * save state and acknowledge the request. The system sleeps once all * acks are in. */ int acpi_ReqSleepState(struct acpi_softc *sc, int state) { #if defined(__amd64__) || defined(__i386__) struct apm_clone_data *clone; ACPI_STATUS status; if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX) return (EINVAL); if (!acpi_sleep_states[state]) return (EOPNOTSUPP); /* * If a reboot/shutdown/suspend request is already in progress or * suspend is blocked due to an upcoming shutdown, just return. */ if (rebooting || sc->acpi_next_sstate != 0 || suspend_blocked) { return (0); } /* Wait until sleep is enabled. */ while (sc->acpi_sleep_disabled) { AcpiOsSleep(1000); } ACPI_LOCK(acpi); sc->acpi_next_sstate = state; /* S5 (soft-off) should be entered directly with no waiting. */ if (state == ACPI_STATE_S5) { ACPI_UNLOCK(acpi); status = acpi_EnterSleepState(sc, state); return (ACPI_SUCCESS(status) ? 0 : ENXIO); } /* Record the pending state and notify all apm devices. */ STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) { clone->notify_status = APM_EV_NONE; if ((clone->flags & ACPI_EVF_DEVD) == 0) { selwakeuppri(&clone->sel_read, PZERO); KNOTE_LOCKED(&clone->sel_read.si_note, 0); } } /* If devd(8) is not running, immediately enter the sleep state. */ if (!devctl_process_running()) { ACPI_UNLOCK(acpi); status = acpi_EnterSleepState(sc, state); return (ACPI_SUCCESS(status) ? 0 : ENXIO); } /* * Set a timeout to fire if userland doesn't ack the suspend request * in time. This way we still eventually go to sleep if we were * overheating or running low on battery, even if userland is hung. * We cancel this timeout once all userland acks are in or the * suspend request is aborted. */ callout_reset(&sc->susp_force_to, 10 * hz, acpi_sleep_force, sc); ACPI_UNLOCK(acpi); /* Now notify devd(8) also. */ acpi_UserNotify("Suspend", ACPI_ROOT_OBJECT, state); return (0); #else /* This platform does not support acpi suspend/resume. */ return (EOPNOTSUPP); #endif } /* * Acknowledge (or reject) a pending sleep state. The caller has * prepared for suspend and is now ready for it to proceed. If the * error argument is non-zero, it indicates suspend should be cancelled * and gives an errno value describing why. Once all votes are in, * we suspend the system. */ int acpi_AckSleepState(struct apm_clone_data *clone, int error) { #if defined(__amd64__) || defined(__i386__) struct acpi_softc *sc; int ret, sleeping; /* If no pending sleep state, return an error. */ ACPI_LOCK(acpi); sc = clone->acpi_sc; if (sc->acpi_next_sstate == 0) { ACPI_UNLOCK(acpi); return (ENXIO); } /* Caller wants to abort suspend process. */ if (error) { sc->acpi_next_sstate = 0; callout_stop(&sc->susp_force_to); device_printf(sc->acpi_dev, "listener on %s cancelled the pending suspend\n", devtoname(clone->cdev)); ACPI_UNLOCK(acpi); return (0); } /* * Mark this device as acking the suspend request. Then, walk through * all devices, seeing if they agree yet. We only count devices that * are writable since read-only devices couldn't ack the request. */ sleeping = TRUE; clone->notify_status = APM_EV_ACKED; STAILQ_FOREACH(clone, &sc->apm_cdevs, entries) { if ((clone->flags & ACPI_EVF_WRITE) != 0 && clone->notify_status != APM_EV_ACKED) { sleeping = FALSE; break; } } /* If all devices have voted "yes", we will suspend now. */ if (sleeping) callout_stop(&sc->susp_force_to); ACPI_UNLOCK(acpi); ret = 0; if (sleeping) { if (ACPI_FAILURE(acpi_EnterSleepState(sc, sc->acpi_next_sstate))) ret = ENODEV; } return (ret); #else /* This platform does not support acpi suspend/resume. */ return (EOPNOTSUPP); #endif } static void acpi_sleep_enable(void *arg) { struct acpi_softc *sc = (struct acpi_softc *)arg; ACPI_LOCK_ASSERT(acpi); /* Reschedule if the system is not fully up and running. */ if (!AcpiGbl_SystemAwakeAndRunning) { callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME); return; } sc->acpi_sleep_disabled = FALSE; } static ACPI_STATUS acpi_sleep_disable(struct acpi_softc *sc) { ACPI_STATUS status; /* Fail if the system is not fully up and running. */ if (!AcpiGbl_SystemAwakeAndRunning) return (AE_ERROR); ACPI_LOCK(acpi); status = sc->acpi_sleep_disabled ? AE_ERROR : AE_OK; sc->acpi_sleep_disabled = TRUE; ACPI_UNLOCK(acpi); return (status); } enum acpi_sleep_state { ACPI_SS_NONE, ACPI_SS_GPE_SET, ACPI_SS_DEV_SUSPEND, ACPI_SS_SLP_PREP, ACPI_SS_SLEPT, }; /* * Enter the desired system sleep state. * * Currently we support S1-S5 but S4 is only S4BIOS */ static ACPI_STATUS acpi_EnterSleepState(struct acpi_softc *sc, int state) { register_t intr; ACPI_STATUS status; ACPI_EVENT_STATUS power_button_status; enum acpi_sleep_state slp_state; int sleep_result; ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); if (state < ACPI_STATE_S1 || state > ACPI_S_STATES_MAX) return_ACPI_STATUS (AE_BAD_PARAMETER); if (!acpi_sleep_states[state]) { device_printf(sc->acpi_dev, "Sleep state S%d not supported by BIOS\n", state); return (AE_SUPPORT); } /* Re-entry once we're suspending is not allowed. */ status = acpi_sleep_disable(sc); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "suspend request ignored (not ready yet)\n"); return (status); } if (state == ACPI_STATE_S5) { /* * Shut down cleanly and power off. This will call us back through the * shutdown handlers. */ shutdown_nice(RB_POWEROFF); return_ACPI_STATUS (AE_OK); } EVENTHANDLER_INVOKE(power_suspend_early); stop_all_proc(); EVENTHANDLER_INVOKE(power_suspend); if (smp_started) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); } /* * Be sure to hold Giant across DEVICE_SUSPEND/RESUME since non-MPSAFE * drivers need this. */ mtx_lock(&Giant); slp_state = ACPI_SS_NONE; sc->acpi_sstate = state; /* Enable any GPEs as appropriate and requested by the user. */ acpi_wake_prep_walk(state); slp_state = ACPI_SS_GPE_SET; /* * Inform all devices that we are going to sleep. If at least one * device fails, DEVICE_SUSPEND() automatically resumes the tree. * * XXX Note that a better two-pass approach with a 'veto' pass * followed by a "real thing" pass would be better, but the current * bus interface does not provide for this. */ if (DEVICE_SUSPEND(root_bus) != 0) { device_printf(sc->acpi_dev, "device_suspend failed\n"); goto backout; } slp_state = ACPI_SS_DEV_SUSPEND; /* If testing device suspend only, back out of everything here. */ if (acpi_susp_bounce) goto backout; status = AcpiEnterSleepStatePrep(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepStatePrep failed - %s\n", AcpiFormatException(status)); goto backout; } slp_state = ACPI_SS_SLP_PREP; if (sc->acpi_sleep_delay > 0) DELAY(sc->acpi_sleep_delay * 1000000); intr = intr_disable(); if (state != ACPI_STATE_S1) { sleep_result = acpi_sleep_machdep(sc, state); acpi_wakeup_machdep(sc, state, sleep_result, 0); /* * XXX According to ACPI specification SCI_EN bit should be restored * by ACPI platform (BIOS, firmware) to its pre-sleep state. * Unfortunately some BIOSes fail to do that and that leads to * unexpected and serious consequences during wake up like a system * getting stuck in SMI handlers. * This hack is picked up from Linux, which claims that it follows * Windows behavior. */ if (sleep_result == 1 && state != ACPI_STATE_S4) AcpiWriteBitRegister(ACPI_BITREG_SCI_ENABLE, ACPI_ENABLE_EVENT); AcpiLeaveSleepStatePrep(state); if (sleep_result == 1 && state == ACPI_STATE_S3) { /* * Prevent mis-interpretation of the wakeup by power button * as a request for power off. * Ideally we should post an appropriate wakeup event, * perhaps using acpi_event_power_button_wake or alike. * * Clearing of power button status after wakeup is mandated * by ACPI specification in section "Fixed Power Button". * * XXX As of ACPICA 20121114 AcpiGetEventStatus provides * status as 0/1 corressponding to inactive/active despite * its type being ACPI_EVENT_STATUS. In other words, * we should not test for ACPI_EVENT_FLAG_SET for time being. */ if (ACPI_SUCCESS(AcpiGetEventStatus(ACPI_EVENT_POWER_BUTTON, &power_button_status)) && power_button_status != 0) { AcpiClearEvent(ACPI_EVENT_POWER_BUTTON); device_printf(sc->acpi_dev, "cleared fixed power button status\n"); } } intr_restore(intr); /* call acpi_wakeup_machdep() again with interrupt enabled */ acpi_wakeup_machdep(sc, state, sleep_result, 1); if (sleep_result == -1) goto backout; /* Re-enable ACPI hardware on wakeup from sleep state 4. */ if (state == ACPI_STATE_S4) AcpiEnable(); } else { status = AcpiEnterSleepState(state); AcpiLeaveSleepStatePrep(state); intr_restore(intr); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); goto backout; } } slp_state = ACPI_SS_SLEPT; /* * Back out state according to how far along we got in the suspend * process. This handles both the error and success cases. */ backout: if (slp_state >= ACPI_SS_GPE_SET) { acpi_wake_prep_walk(state); sc->acpi_sstate = ACPI_STATE_S0; } if (slp_state >= ACPI_SS_DEV_SUSPEND) DEVICE_RESUME(root_bus); if (slp_state >= ACPI_SS_SLP_PREP) AcpiLeaveSleepState(state); if (slp_state >= ACPI_SS_SLEPT) { acpi_resync_clock(sc); acpi_enable_fixed_events(sc); } sc->acpi_next_sstate = 0; mtx_unlock(&Giant); if (smp_started) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } resume_all_proc(); EVENTHANDLER_INVOKE(power_resume); /* Allow another sleep request after a while. */ callout_schedule(&acpi_sleep_timer, hz * ACPI_MINIMUM_AWAKETIME); /* Run /etc/rc.resume after we are back. */ if (devctl_process_running()) acpi_UserNotify("Resume", ACPI_ROOT_OBJECT, state); return_ACPI_STATUS (status); } static void acpi_resync_clock(struct acpi_softc *sc) { #ifdef __amd64__ if (!acpi_reset_clock) return; /* * Warm up timecounter again and reset system clock. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); inittodr(time_second + sc->acpi_sleep_delay); #endif } /* Enable or disable the device's wake GPE. */ int acpi_wake_set_enable(device_t dev, int enable) { struct acpi_prw_data prw; ACPI_STATUS status; int flags; /* Make sure the device supports waking the system and get the GPE. */ if (acpi_parse_prw(acpi_get_handle(dev), &prw) != 0) return (ENXIO); flags = acpi_get_flags(dev); if (enable) { status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE); if (ACPI_FAILURE(status)) { device_printf(dev, "enable wake failed\n"); return (ENXIO); } acpi_set_flags(dev, flags | ACPI_FLAG_WAKE_ENABLED); } else { status = AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE); if (ACPI_FAILURE(status)) { device_printf(dev, "disable wake failed\n"); return (ENXIO); } acpi_set_flags(dev, flags & ~ACPI_FLAG_WAKE_ENABLED); } return (0); } static int acpi_wake_sleep_prep(ACPI_HANDLE handle, int sstate) { struct acpi_prw_data prw; device_t dev; /* Check that this is a wake-capable device and get its GPE. */ if (acpi_parse_prw(handle, &prw) != 0) return (ENXIO); dev = acpi_get_device(handle); /* * The destination sleep state must be less than (i.e., higher power) * or equal to the value specified by _PRW. If this GPE cannot be * enabled for the next sleep state, then disable it. If it can and * the user requested it be enabled, turn on any required power resources * and set _PSW. */ if (sstate > prw.lowest_wake) { AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_DISABLE); if (bootverbose) device_printf(dev, "wake_prep disabled wake for %s (S%d)\n", acpi_name(handle), sstate); } else if (dev && (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) != 0) { acpi_pwr_wake_enable(handle, 1); acpi_SetInteger(handle, "_PSW", 1); if (bootverbose) device_printf(dev, "wake_prep enabled for %s (S%d)\n", acpi_name(handle), sstate); } return (0); } static int acpi_wake_run_prep(ACPI_HANDLE handle, int sstate) { struct acpi_prw_data prw; device_t dev; /* * Check that this is a wake-capable device and get its GPE. Return * now if the user didn't enable this device for wake. */ if (acpi_parse_prw(handle, &prw) != 0) return (ENXIO); dev = acpi_get_device(handle); if (dev == NULL || (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) == 0) return (0); /* * If this GPE couldn't be enabled for the previous sleep state, it was * disabled before going to sleep so re-enable it. If it was enabled, * clear _PSW and turn off any power resources it used. */ if (sstate > prw.lowest_wake) { AcpiSetGpeWakeMask(prw.gpe_handle, prw.gpe_bit, ACPI_GPE_ENABLE); if (bootverbose) device_printf(dev, "run_prep re-enabled %s\n", acpi_name(handle)); } else { acpi_SetInteger(handle, "_PSW", 0); acpi_pwr_wake_enable(handle, 0); if (bootverbose) device_printf(dev, "run_prep cleaned up for %s\n", acpi_name(handle)); } return (0); } static ACPI_STATUS acpi_wake_prep(ACPI_HANDLE handle, UINT32 level, void *context, void **status) { int sstate; /* If suspending, run the sleep prep function, otherwise wake. */ sstate = *(int *)context; if (AcpiGbl_SystemAwakeAndRunning) acpi_wake_sleep_prep(handle, sstate); else acpi_wake_run_prep(handle, sstate); return (AE_OK); } /* Walk the tree rooted at acpi0 to prep devices for suspend/resume. */ static int acpi_wake_prep_walk(int sstate) { ACPI_HANDLE sb_handle; if (ACPI_SUCCESS(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle))) AcpiWalkNamespace(ACPI_TYPE_DEVICE, sb_handle, 100, acpi_wake_prep, NULL, &sstate, NULL); return (0); } /* Walk the tree rooted at acpi0 to attach per-device wake sysctls. */ static int acpi_wake_sysctl_walk(device_t dev) { int error, i, numdevs; device_t *devlist; device_t child; ACPI_STATUS status; error = device_get_children(dev, &devlist, &numdevs); if (error != 0 || numdevs == 0) { if (numdevs == 0) free(devlist, M_TEMP); return (error); } for (i = 0; i < numdevs; i++) { child = devlist[i]; acpi_wake_sysctl_walk(child); if (!device_is_attached(child)) continue; status = AcpiEvaluateObject(acpi_get_handle(child), "_PRW", NULL, NULL); if (ACPI_SUCCESS(status)) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(child), SYSCTL_CHILDREN(device_get_sysctl_tree(child)), OID_AUTO, "wake", CTLTYPE_INT | CTLFLAG_RW, child, 0, acpi_wake_set_sysctl, "I", "Device set to wake the system"); } } free(devlist, M_TEMP); return (0); } /* Enable or disable wake from userland. */ static int acpi_wake_set_sysctl(SYSCTL_HANDLER_ARGS) { int enable, error; device_t dev; dev = (device_t)arg1; enable = (acpi_get_flags(dev) & ACPI_FLAG_WAKE_ENABLED) ? 1 : 0; error = sysctl_handle_int(oidp, &enable, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (enable != 0 && enable != 1) return (EINVAL); return (acpi_wake_set_enable(dev, enable)); } /* Parse a device's _PRW into a structure. */ int acpi_parse_prw(ACPI_HANDLE h, struct acpi_prw_data *prw) { ACPI_STATUS status; ACPI_BUFFER prw_buffer; ACPI_OBJECT *res, *res2; int error, i, power_count; if (h == NULL || prw == NULL) return (EINVAL); /* * The _PRW object (7.2.9) is only required for devices that have the * ability to wake the system from a sleeping state. */ error = EINVAL; prw_buffer.Pointer = NULL; prw_buffer.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(h, "_PRW", NULL, &prw_buffer); if (ACPI_FAILURE(status)) return (ENOENT); res = (ACPI_OBJECT *)prw_buffer.Pointer; if (res == NULL) return (ENOENT); if (!ACPI_PKG_VALID(res, 2)) goto out; /* * Element 1 of the _PRW object: * The lowest power system sleeping state that can be entered while still * providing wake functionality. The sleeping state being entered must * be less than (i.e., higher power) or equal to this value. */ if (acpi_PkgInt32(res, 1, &prw->lowest_wake) != 0) goto out; /* * Element 0 of the _PRW object: */ switch (res->Package.Elements[0].Type) { case ACPI_TYPE_INTEGER: /* * If the data type of this package element is numeric, then this * _PRW package element is the bit index in the GPEx_EN, in the * GPE blocks described in the FADT, of the enable bit that is * enabled for the wake event. */ prw->gpe_handle = NULL; prw->gpe_bit = res->Package.Elements[0].Integer.Value; error = 0; break; case ACPI_TYPE_PACKAGE: /* * If the data type of this package element is a package, then this * _PRW package element is itself a package containing two * elements. The first is an object reference to the GPE Block * device that contains the GPE that will be triggered by the wake * event. The second element is numeric and it contains the bit * index in the GPEx_EN, in the GPE Block referenced by the * first element in the package, of the enable bit that is enabled for * the wake event. * * For example, if this field is a package then it is of the form: * Package() {\_SB.PCI0.ISA.GPE, 2} */ res2 = &res->Package.Elements[0]; if (!ACPI_PKG_VALID(res2, 2)) goto out; prw->gpe_handle = acpi_GetReference(NULL, &res2->Package.Elements[0]); if (prw->gpe_handle == NULL) goto out; if (acpi_PkgInt32(res2, 1, &prw->gpe_bit) != 0) goto out; error = 0; break; default: goto out; } /* Elements 2 to N of the _PRW object are power resources. */ power_count = res->Package.Count - 2; if (power_count > ACPI_PRW_MAX_POWERRES) { printf("ACPI device %s has too many power resources\n", acpi_name(h)); power_count = 0; } prw->power_res_count = power_count; for (i = 0; i < power_count; i++) prw->power_res[i] = res->Package.Elements[i]; out: if (prw_buffer.Pointer != NULL) AcpiOsFree(prw_buffer.Pointer); return (error); } /* * ACPI Event Handlers */ /* System Event Handlers (registered by EVENTHANDLER_REGISTER) */ static void acpi_system_eventhandler_sleep(void *arg, int state) { struct acpi_softc *sc = (struct acpi_softc *)arg; int ret; ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); /* Check if button action is disabled or unknown. */ if (state == ACPI_STATE_UNKNOWN) return; /* Request that the system prepare to enter the given suspend state. */ ret = acpi_ReqSleepState(sc, state); if (ret != 0) device_printf(sc->acpi_dev, "request to enter state S%d failed (err %d)\n", state, ret); return_VOID; } static void acpi_system_eventhandler_wakeup(void *arg, int state) { ACPI_FUNCTION_TRACE_U32((char *)(uintptr_t)__func__, state); /* Currently, nothing to do for wakeup. */ return_VOID; } /* * ACPICA Event Handlers (FixedEvent, also called from button notify handler) */ static void acpi_invoke_sleep_eventhandler(void *context) { EVENTHANDLER_INVOKE(acpi_sleep_event, *(int *)context); } static void acpi_invoke_wake_eventhandler(void *context) { EVENTHANDLER_INVOKE(acpi_wakeup_event, *(int *)context); } UINT32 acpi_event_power_button_sleep(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_sleep_eventhandler, &sc->acpi_power_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_power_button_wake(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_wake_eventhandler, &sc->acpi_power_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_sleep_button_sleep(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_sleep_eventhandler, &sc->acpi_sleep_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } UINT32 acpi_event_sleep_button_wake(void *context) { struct acpi_softc *sc = (struct acpi_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_invoke_wake_eventhandler, &sc->acpi_sleep_button_sx))) return_VALUE (ACPI_INTERRUPT_NOT_HANDLED); return_VALUE (ACPI_INTERRUPT_HANDLED); } /* * XXX This static buffer is suboptimal. There is no locking so only * use this for single-threaded callers. */ char * acpi_name(ACPI_HANDLE handle) { ACPI_BUFFER buf; static char data[256]; buf.Length = sizeof(data); buf.Pointer = data; if (handle && ACPI_SUCCESS(AcpiGetName(handle, ACPI_FULL_PATHNAME, &buf))) return (data); return ("(unknown)"); } /* * Debugging/bug-avoidance. Avoid trying to fetch info on various * parts of the namespace. */ int acpi_avoid(ACPI_HANDLE handle) { char *cp, *env, *np; int len; np = acpi_name(handle); if (*np == '\\') np++; if ((env = kern_getenv("debug.acpi.avoid")) == NULL) return (0); /* Scan the avoid list checking for a match */ cp = env; for (;;) { while (*cp != 0 && isspace(*cp)) cp++; if (*cp == 0) break; len = 0; while (cp[len] != 0 && !isspace(cp[len])) len++; if (!strncmp(cp, np, len)) { freeenv(env); return(1); } cp += len; } freeenv(env); return (0); } /* * Debugging/bug-avoidance. Disable ACPI subsystem components. */ int acpi_disabled(char *subsys) { char *cp, *env; int len; if ((env = kern_getenv("debug.acpi.disabled")) == NULL) return (0); if (strcmp(env, "all") == 0) { freeenv(env); return (1); } /* Scan the disable list, checking for a match. */ cp = env; for (;;) { while (*cp != '\0' && isspace(*cp)) cp++; if (*cp == '\0') break; len = 0; while (cp[len] != '\0' && !isspace(cp[len])) len++; if (strncmp(cp, subsys, len) == 0) { freeenv(env); return (1); } cp += len; } freeenv(env); return (0); } static void acpi_lookup(void *arg, const char *name, device_t *dev) { ACPI_HANDLE handle; if (*dev != NULL) return; /* * Allow any handle name that is specified as an absolute path and * starts with '\'. We could restrict this to \_SB and friends, * but see acpi_probe_children() for notes on why we scan the entire * namespace for devices. * * XXX: The pathname argument to AcpiGetHandle() should be fixed to * be const. */ if (name[0] != '\\') return; if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, __DECONST(char *, name), &handle))) return; *dev = acpi_get_device(handle); } /* * Control interface. * * We multiplex ioctls for all participating ACPI devices here. Individual * drivers wanting to be accessible via /dev/acpi should use the * register/deregister interface to make their handlers visible. */ struct acpi_ioctl_hook { TAILQ_ENTRY(acpi_ioctl_hook) link; u_long cmd; acpi_ioctl_fn fn; void *arg; }; static TAILQ_HEAD(,acpi_ioctl_hook) acpi_ioctl_hooks; static int acpi_ioctl_hooks_initted; int acpi_register_ioctl(u_long cmd, acpi_ioctl_fn fn, void *arg) { struct acpi_ioctl_hook *hp; if ((hp = malloc(sizeof(*hp), M_ACPIDEV, M_NOWAIT)) == NULL) return (ENOMEM); hp->cmd = cmd; hp->fn = fn; hp->arg = arg; ACPI_LOCK(acpi); if (acpi_ioctl_hooks_initted == 0) { TAILQ_INIT(&acpi_ioctl_hooks); acpi_ioctl_hooks_initted = 1; } TAILQ_INSERT_TAIL(&acpi_ioctl_hooks, hp, link); ACPI_UNLOCK(acpi); return (0); } void acpi_deregister_ioctl(u_long cmd, acpi_ioctl_fn fn) { struct acpi_ioctl_hook *hp; ACPI_LOCK(acpi); TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) if (hp->cmd == cmd && hp->fn == fn) break; if (hp != NULL) { TAILQ_REMOVE(&acpi_ioctl_hooks, hp, link); free(hp, M_ACPIDEV); } ACPI_UNLOCK(acpi); } static int acpiopen(struct cdev *dev, int flag, int fmt, struct thread *td) { return (0); } static int acpiclose(struct cdev *dev, int flag, int fmt, struct thread *td) { return (0); } static int acpiioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct acpi_softc *sc; struct acpi_ioctl_hook *hp; int error, state; error = 0; hp = NULL; sc = dev->si_drv1; /* * Scan the list of registered ioctls, looking for handlers. */ ACPI_LOCK(acpi); if (acpi_ioctl_hooks_initted) TAILQ_FOREACH(hp, &acpi_ioctl_hooks, link) { if (hp->cmd == cmd) break; } ACPI_UNLOCK(acpi); if (hp) return (hp->fn(cmd, addr, hp->arg)); /* * Core ioctls are not permitted for non-writable user. * Currently, other ioctls just fetch information. * Not changing system behavior. */ if ((flag & FWRITE) == 0) return (EPERM); /* Core system ioctls. */ switch (cmd) { case ACPIIO_REQSLPSTATE: state = *(int *)addr; if (state != ACPI_STATE_S5) return (acpi_ReqSleepState(sc, state)); device_printf(sc->acpi_dev, "power off via acpi ioctl not supported\n"); error = EOPNOTSUPP; break; case ACPIIO_ACKSLPSTATE: error = *(int *)addr; error = acpi_AckSleepState(sc->acpi_clone, error); break; case ACPIIO_SETSLPSTATE: /* DEPRECATED */ state = *(int *)addr; if (state < ACPI_STATE_S0 || state > ACPI_S_STATES_MAX) return (EINVAL); if (!acpi_sleep_states[state]) return (EOPNOTSUPP); if (ACPI_FAILURE(acpi_SetSleepState(sc, state))) error = ENXIO; break; default: error = ENXIO; break; } return (error); } static int acpi_sname2sstate(const char *sname) { int sstate; if (toupper(sname[0]) == 'S') { sstate = sname[1] - '0'; if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5 && sname[2] == '\0') return (sstate); } else if (strcasecmp(sname, "NONE") == 0) return (ACPI_STATE_UNKNOWN); return (-1); } static const char * acpi_sstate2sname(int sstate) { static const char *snames[] = { "S0", "S1", "S2", "S3", "S4", "S5" }; if (sstate >= ACPI_STATE_S0 && sstate <= ACPI_STATE_S5) return (snames[sstate]); else if (sstate == ACPI_STATE_UNKNOWN) return ("NONE"); return (NULL); } static int acpi_supported_sleep_state_sysctl(SYSCTL_HANDLER_ARGS) { int error; struct sbuf sb; UINT8 state; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (state = ACPI_STATE_S1; state < ACPI_S_STATE_COUNT; state++) if (acpi_sleep_states[state]) sbuf_printf(&sb, "%s ", acpi_sstate2sname(state)); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (error); } static int acpi_sleep_state_sysctl(SYSCTL_HANDLER_ARGS) { char sleep_state[10]; int error, new_state, old_state; old_state = *(int *)oidp->oid_arg1; strlcpy(sleep_state, acpi_sstate2sname(old_state), sizeof(sleep_state)); error = sysctl_handle_string(oidp, sleep_state, sizeof(sleep_state), req); if (error == 0 && req->newptr != NULL) { new_state = acpi_sname2sstate(sleep_state); if (new_state < ACPI_STATE_S1) return (EINVAL); if (new_state < ACPI_S_STATE_COUNT && !acpi_sleep_states[new_state]) return (EOPNOTSUPP); if (new_state != old_state) *(int *)oidp->oid_arg1 = new_state; } return (error); } /* Inform devctl(4) when we receive a Notify. */ void acpi_UserNotify(const char *subsystem, ACPI_HANDLE h, uint8_t notify) { char notify_buf[16]; ACPI_BUFFER handle_buf; ACPI_STATUS status; if (subsystem == NULL) return; handle_buf.Pointer = NULL; handle_buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiNsHandleToPathname(h, &handle_buf, FALSE); if (ACPI_FAILURE(status)) return; snprintf(notify_buf, sizeof(notify_buf), "notify=0x%02x", notify); devctl_notify("ACPI", subsystem, handle_buf.Pointer, notify_buf); AcpiOsFree(handle_buf.Pointer); } #ifdef ACPI_DEBUG /* * Support for parsing debug options from the kernel environment. * * Bits may be set in the AcpiDbgLayer and AcpiDbgLevel debug registers * by specifying the names of the bits in the debug.acpi.layer and * debug.acpi.level environment variables. Bits may be unset by * prefixing the bit name with !. */ struct debugtag { char *name; UINT32 value; }; static struct debugtag dbg_layer[] = { {"ACPI_UTILITIES", ACPI_UTILITIES}, {"ACPI_HARDWARE", ACPI_HARDWARE}, {"ACPI_EVENTS", ACPI_EVENTS}, {"ACPI_TABLES", ACPI_TABLES}, {"ACPI_NAMESPACE", ACPI_NAMESPACE}, {"ACPI_PARSER", ACPI_PARSER}, {"ACPI_DISPATCHER", ACPI_DISPATCHER}, {"ACPI_EXECUTER", ACPI_EXECUTER}, {"ACPI_RESOURCES", ACPI_RESOURCES}, {"ACPI_CA_DEBUGGER", ACPI_CA_DEBUGGER}, {"ACPI_OS_SERVICES", ACPI_OS_SERVICES}, {"ACPI_CA_DISASSEMBLER", ACPI_CA_DISASSEMBLER}, {"ACPI_ALL_COMPONENTS", ACPI_ALL_COMPONENTS}, {"ACPI_AC_ADAPTER", ACPI_AC_ADAPTER}, {"ACPI_BATTERY", ACPI_BATTERY}, {"ACPI_BUS", ACPI_BUS}, {"ACPI_BUTTON", ACPI_BUTTON}, {"ACPI_EC", ACPI_EC}, {"ACPI_FAN", ACPI_FAN}, {"ACPI_POWERRES", ACPI_POWERRES}, {"ACPI_PROCESSOR", ACPI_PROCESSOR}, {"ACPI_THERMAL", ACPI_THERMAL}, {"ACPI_TIMER", ACPI_TIMER}, {"ACPI_ALL_DRIVERS", ACPI_ALL_DRIVERS}, {NULL, 0} }; static struct debugtag dbg_level[] = { {"ACPI_LV_INIT", ACPI_LV_INIT}, {"ACPI_LV_DEBUG_OBJECT", ACPI_LV_DEBUG_OBJECT}, {"ACPI_LV_INFO", ACPI_LV_INFO}, {"ACPI_LV_REPAIR", ACPI_LV_REPAIR}, {"ACPI_LV_ALL_EXCEPTIONS", ACPI_LV_ALL_EXCEPTIONS}, /* Trace verbosity level 1 [Standard Trace Level] */ {"ACPI_LV_INIT_NAMES", ACPI_LV_INIT_NAMES}, {"ACPI_LV_PARSE", ACPI_LV_PARSE}, {"ACPI_LV_LOAD", ACPI_LV_LOAD}, {"ACPI_LV_DISPATCH", ACPI_LV_DISPATCH}, {"ACPI_LV_EXEC", ACPI_LV_EXEC}, {"ACPI_LV_NAMES", ACPI_LV_NAMES}, {"ACPI_LV_OPREGION", ACPI_LV_OPREGION}, {"ACPI_LV_BFIELD", ACPI_LV_BFIELD}, {"ACPI_LV_TABLES", ACPI_LV_TABLES}, {"ACPI_LV_VALUES", ACPI_LV_VALUES}, {"ACPI_LV_OBJECTS", ACPI_LV_OBJECTS}, {"ACPI_LV_RESOURCES", ACPI_LV_RESOURCES}, {"ACPI_LV_USER_REQUESTS", ACPI_LV_USER_REQUESTS}, {"ACPI_LV_PACKAGE", ACPI_LV_PACKAGE}, {"ACPI_LV_VERBOSITY1", ACPI_LV_VERBOSITY1}, /* Trace verbosity level 2 [Function tracing and memory allocation] */ {"ACPI_LV_ALLOCATIONS", ACPI_LV_ALLOCATIONS}, {"ACPI_LV_FUNCTIONS", ACPI_LV_FUNCTIONS}, {"ACPI_LV_OPTIMIZATIONS", ACPI_LV_OPTIMIZATIONS}, {"ACPI_LV_VERBOSITY2", ACPI_LV_VERBOSITY2}, {"ACPI_LV_ALL", ACPI_LV_ALL}, /* Trace verbosity level 3 [Threading, I/O, and Interrupts] */ {"ACPI_LV_MUTEX", ACPI_LV_MUTEX}, {"ACPI_LV_THREADS", ACPI_LV_THREADS}, {"ACPI_LV_IO", ACPI_LV_IO}, {"ACPI_LV_INTERRUPTS", ACPI_LV_INTERRUPTS}, {"ACPI_LV_VERBOSITY3", ACPI_LV_VERBOSITY3}, /* Exceptionally verbose output -- also used in the global "DebugLevel" */ {"ACPI_LV_AML_DISASSEMBLE", ACPI_LV_AML_DISASSEMBLE}, {"ACPI_LV_VERBOSE_INFO", ACPI_LV_VERBOSE_INFO}, {"ACPI_LV_FULL_TABLES", ACPI_LV_FULL_TABLES}, {"ACPI_LV_EVENTS", ACPI_LV_EVENTS}, {"ACPI_LV_VERBOSE", ACPI_LV_VERBOSE}, {NULL, 0} }; static void acpi_parse_debug(char *cp, struct debugtag *tag, UINT32 *flag) { char *ep; int i, l; int set; while (*cp) { if (isspace(*cp)) { cp++; continue; } ep = cp; while (*ep && !isspace(*ep)) ep++; if (*cp == '!') { set = 0; cp++; if (cp == ep) continue; } else { set = 1; } l = ep - cp; for (i = 0; tag[i].name != NULL; i++) { if (!strncmp(cp, tag[i].name, l)) { if (set) *flag |= tag[i].value; else *flag &= ~tag[i].value; } } cp = ep; } } static void acpi_set_debugging(void *junk) { char *layer, *level; if (cold) { AcpiDbgLayer = 0; AcpiDbgLevel = 0; } layer = kern_getenv("debug.acpi.layer"); level = kern_getenv("debug.acpi.level"); if (layer == NULL && level == NULL) return; printf("ACPI set debug"); if (layer != NULL) { if (strcmp("NONE", layer) != 0) printf(" layer '%s'", layer); acpi_parse_debug(layer, &dbg_layer[0], &AcpiDbgLayer); freeenv(layer); } if (level != NULL) { if (strcmp("NONE", level) != 0) printf(" level '%s'", level); acpi_parse_debug(level, &dbg_level[0], &AcpiDbgLevel); freeenv(level); } printf("\n"); } SYSINIT(acpi_debugging, SI_SUB_TUNABLES, SI_ORDER_ANY, acpi_set_debugging, NULL); static int acpi_debug_sysctl(SYSCTL_HANDLER_ARGS) { int error, *dbg; struct debugtag *tag; struct sbuf sb; char temp[128]; if (sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND) == NULL) return (ENOMEM); if (strcmp(oidp->oid_arg1, "debug.acpi.layer") == 0) { tag = &dbg_layer[0]; dbg = &AcpiDbgLayer; } else { tag = &dbg_level[0]; dbg = &AcpiDbgLevel; } /* Get old values if this is a get request. */ ACPI_SERIAL_BEGIN(acpi); if (*dbg == 0) { sbuf_cpy(&sb, "NONE"); } else if (req->newptr == NULL) { for (; tag->name != NULL; tag++) { if ((*dbg & tag->value) == tag->value) sbuf_printf(&sb, "%s ", tag->name); } } sbuf_trim(&sb); sbuf_finish(&sb); strlcpy(temp, sbuf_data(&sb), sizeof(temp)); sbuf_delete(&sb); error = sysctl_handle_string(oidp, temp, sizeof(temp), req); /* Check for error or no change */ if (error == 0 && req->newptr != NULL) { *dbg = 0; kern_setenv((char *)oidp->oid_arg1, temp); acpi_set_debugging(NULL); } ACPI_SERIAL_END(acpi); return (error); } SYSCTL_PROC(_debug_acpi, OID_AUTO, layer, CTLFLAG_RW | CTLTYPE_STRING, "debug.acpi.layer", 0, acpi_debug_sysctl, "A", ""); SYSCTL_PROC(_debug_acpi, OID_AUTO, level, CTLFLAG_RW | CTLTYPE_STRING, "debug.acpi.level", 0, acpi_debug_sysctl, "A", ""); #endif /* ACPI_DEBUG */ static int acpi_debug_objects_sysctl(SYSCTL_HANDLER_ARGS) { int error; int old; old = acpi_debug_objects; error = sysctl_handle_int(oidp, &acpi_debug_objects, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (old == acpi_debug_objects || (old && acpi_debug_objects)) return (0); ACPI_SERIAL_BEGIN(acpi); AcpiGbl_EnableAmlDebugObject = acpi_debug_objects ? TRUE : FALSE; ACPI_SERIAL_END(acpi); return (0); } static int acpi_parse_interfaces(char *str, struct acpi_interface *iface) { char *p; size_t len; int i, j; p = str; while (isspace(*p) || *p == ',') p++; len = strlen(p); if (len == 0) return (0); p = strdup(p, M_TEMP); for (i = 0; i < len; i++) if (p[i] == ',') p[i] = '\0'; i = j = 0; while (i < len) if (isspace(p[i]) || p[i] == '\0') i++; else { i += strlen(p + i) + 1; j++; } if (j == 0) { free(p, M_TEMP); return (0); } iface->data = malloc(sizeof(*iface->data) * j, M_TEMP, M_WAITOK); iface->num = j; i = j = 0; while (i < len) if (isspace(p[i]) || p[i] == '\0') i++; else { iface->data[j] = p + i; i += strlen(p + i) + 1; j++; } return (j); } static void acpi_free_interfaces(struct acpi_interface *iface) { free(iface->data[0], M_TEMP); free(iface->data, M_TEMP); } static void acpi_reset_interfaces(device_t dev) { struct acpi_interface list; ACPI_STATUS status; int i; if (acpi_parse_interfaces(acpi_install_interface, &list) > 0) { for (i = 0; i < list.num; i++) { status = AcpiInstallInterface(list.data[i]); if (ACPI_FAILURE(status)) device_printf(dev, "failed to install _OSI(\"%s\"): %s\n", list.data[i], AcpiFormatException(status)); else if (bootverbose) device_printf(dev, "installed _OSI(\"%s\")\n", list.data[i]); } acpi_free_interfaces(&list); } if (acpi_parse_interfaces(acpi_remove_interface, &list) > 0) { for (i = 0; i < list.num; i++) { status = AcpiRemoveInterface(list.data[i]); if (ACPI_FAILURE(status)) device_printf(dev, "failed to remove _OSI(\"%s\"): %s\n", list.data[i], AcpiFormatException(status)); else if (bootverbose) device_printf(dev, "removed _OSI(\"%s\")\n", list.data[i]); } acpi_free_interfaces(&list); } } static int acpi_pm_func(u_long cmd, void *arg, ...) { int state, acpi_state; int error; struct acpi_softc *sc; va_list ap; error = 0; switch (cmd) { case POWER_CMD_SUSPEND: sc = (struct acpi_softc *)arg; if (sc == NULL) { error = EINVAL; goto out; } va_start(ap, arg); state = va_arg(ap, int); va_end(ap); switch (state) { case POWER_SLEEP_STATE_STANDBY: acpi_state = sc->acpi_standby_sx; break; case POWER_SLEEP_STATE_SUSPEND: acpi_state = sc->acpi_suspend_sx; break; case POWER_SLEEP_STATE_HIBERNATE: acpi_state = ACPI_STATE_S4; break; default: error = EINVAL; goto out; } if (ACPI_FAILURE(acpi_EnterSleepState(sc, acpi_state))) error = ENXIO; break; default: error = EINVAL; goto out; } out: return (error); } static void acpi_pm_register(void *arg) { if (!cold || resource_disabled("acpi", 0)) return; power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, NULL); } SYSINIT(power, SI_SUB_KLD, SI_ORDER_ANY, acpi_pm_register, 0); Index: head/sys/dev/acpica/acpivar.h =================================================================== --- head/sys/dev/acpica/acpivar.h (revision 297747) +++ head/sys/dev/acpica/acpivar.h (revision 297748) @@ -1,512 +1,510 @@ /*- * Copyright (c) 2000 Mitsuru IWASAKI * Copyright (c) 2000 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _ACPIVAR_H_ #define _ACPIVAR_H_ #ifdef _KERNEL #include "acpi_if.h" #include "bus_if.h" #include #include #include #include #include #include #include #include #include struct apm_clone_data; struct acpi_softc { device_t acpi_dev; struct cdev *acpi_dev_t; int acpi_enabled; int acpi_sstate; int acpi_sleep_disabled; int acpi_resources_reserved; struct sysctl_ctx_list acpi_sysctl_ctx; struct sysctl_oid *acpi_sysctl_tree; int acpi_power_button_sx; int acpi_sleep_button_sx; int acpi_lid_switch_sx; int acpi_standby_sx; int acpi_suspend_sx; int acpi_sleep_delay; int acpi_s4bios; int acpi_do_disable; int acpi_verbose; int acpi_handle_reboot; vm_offset_t acpi_wakeaddr; vm_paddr_t acpi_wakephys; int acpi_next_sstate; /* Next suspend Sx state. */ struct apm_clone_data *acpi_clone; /* Pseudo-dev for devd(8). */ STAILQ_HEAD(,apm_clone_data) apm_cdevs; /* All apm/apmctl/acpi cdevs. */ struct callout susp_force_to; /* Force suspend if no acks. */ }; struct acpi_device { /* ACPI ivars */ ACPI_HANDLE ad_handle; void *ad_private; int ad_flags; /* Resources */ struct resource_list ad_rl; }; /* Track device (/dev/{apm,apmctl} and /dev/acpi) notification status. */ struct apm_clone_data { STAILQ_ENTRY(apm_clone_data) entries; struct cdev *cdev; int flags; #define ACPI_EVF_NONE 0 /* /dev/apm semantics */ #define ACPI_EVF_DEVD 1 /* /dev/acpi is handled via devd(8) */ #define ACPI_EVF_WRITE 2 /* Device instance is opened writable. */ int notify_status; #define APM_EV_NONE 0 /* Device not yet aware of pending sleep. */ #define APM_EV_NOTIFIED 1 /* Device saw next sleep state. */ #define APM_EV_ACKED 2 /* Device agreed sleep can occur. */ struct acpi_softc *acpi_sc; struct selinfo sel_read; }; #define ACPI_PRW_MAX_POWERRES 8 struct acpi_prw_data { ACPI_HANDLE gpe_handle; int gpe_bit; int lowest_wake; ACPI_OBJECT power_res[ACPI_PRW_MAX_POWERRES]; int power_res_count; }; /* Flags for each device defined in the AML namespace. */ #define ACPI_FLAG_WAKE_ENABLED 0x1 /* Macros for extracting parts of a PCI address from an _ADR value. */ #define ACPI_ADR_PCI_SLOT(adr) (((adr) & 0xffff0000) >> 16) #define ACPI_ADR_PCI_FUNC(adr) ((adr) & 0xffff) /* * Entry points to ACPI from above are global functions defined in this * file, sysctls, and I/O on the control device. Entry points from below * are interrupts (the SCI), notifies, task queue threads, and the thermal * zone polling thread. * * ACPI tables and global shared data are protected by a global lock * (acpi_mutex). * * Each ACPI device can have its own driver-specific mutex for protecting * shared access to local data. The ACPI_LOCK macros handle mutexes. * * Drivers that need to serialize access to functions (e.g., to route * interrupts, get/set control paths, etc.) should use the sx lock macros * (ACPI_SERIAL). * * ACPI-CA handles its own locking and should not be called with locks held. * * The most complicated path is: * GPE -> EC runs _Qxx -> _Qxx reads EC space -> GPE */ extern struct mtx acpi_mutex; #define ACPI_LOCK(sys) mtx_lock(&sys##_mutex) #define ACPI_UNLOCK(sys) mtx_unlock(&sys##_mutex) #define ACPI_LOCK_ASSERT(sys) mtx_assert(&sys##_mutex, MA_OWNED); #define ACPI_LOCK_DECL(sys, name) \ static struct mtx sys##_mutex; \ MTX_SYSINIT(sys##_mutex, &sys##_mutex, name, MTX_DEF) #define ACPI_SERIAL_BEGIN(sys) sx_xlock(&sys##_sxlock) #define ACPI_SERIAL_END(sys) sx_xunlock(&sys##_sxlock) #define ACPI_SERIAL_ASSERT(sys) sx_assert(&sys##_sxlock, SX_XLOCKED); #define ACPI_SERIAL_DECL(sys, name) \ static struct sx sys##_sxlock; \ SX_SYSINIT(sys##_sxlock, &sys##_sxlock, name) /* * ACPI CA does not define layers for non-ACPI CA drivers. * We define some here within the range provided. */ #define ACPI_AC_ADAPTER 0x00010000 #define ACPI_BATTERY 0x00020000 #define ACPI_BUS 0x00040000 #define ACPI_BUTTON 0x00080000 #define ACPI_EC 0x00100000 #define ACPI_FAN 0x00200000 #define ACPI_POWERRES 0x00400000 #define ACPI_PROCESSOR 0x00800000 #define ACPI_THERMAL 0x01000000 #define ACPI_TIMER 0x02000000 #define ACPI_OEM 0x04000000 /* * Constants for different interrupt models used with acpi_SetIntrModel(). */ #define ACPI_INTR_PIC 0 #define ACPI_INTR_APIC 1 #define ACPI_INTR_SAPIC 2 /* * Various features and capabilities for the acpi_get_features() method. * In particular, these are used for the ACPI 3.0 _PDC and _OSC methods. * See the Intel document titled "Intel Processor Vendor-Specific ACPI", * number 302223-007. */ #define ACPI_CAP_PERF_MSRS (1 << 0) /* Intel SpeedStep PERF_CTL MSRs */ #define ACPI_CAP_C1_IO_HALT (1 << 1) /* Intel C1 "IO then halt" sequence */ #define ACPI_CAP_THR_MSRS (1 << 2) /* Intel OnDemand throttling MSRs */ #define ACPI_CAP_SMP_SAME (1 << 3) /* MP C1, Px, and Tx (all the same) */ #define ACPI_CAP_SMP_SAME_C3 (1 << 4) /* MP C2 and C3 (all the same) */ #define ACPI_CAP_SMP_DIFF_PX (1 << 5) /* MP Px (different, using _PSD) */ #define ACPI_CAP_SMP_DIFF_CX (1 << 6) /* MP Cx (different, using _CSD) */ #define ACPI_CAP_SMP_DIFF_TX (1 << 7) /* MP Tx (different, using _TSD) */ #define ACPI_CAP_SMP_C1_NATIVE (1 << 8) /* MP C1 support other than halt */ #define ACPI_CAP_SMP_C3_NATIVE (1 << 9) /* MP C2 and C3 support */ #define ACPI_CAP_PX_HW_COORD (1 << 11) /* Intel P-state HW coordination */ #define ACPI_CAP_INTR_CPPC (1 << 12) /* Native Interrupt Handling for Collaborative Processor Performance Control notifications */ #define ACPI_CAP_HW_DUTY_C (1 << 13) /* Hardware Duty Cycling */ /* * Quirk flags. * * ACPI_Q_BROKEN: Disables all ACPI support. * ACPI_Q_TIMER: Disables support for the ACPI timer. * ACPI_Q_MADT_IRQ0: Specifies that ISA IRQ 0 is wired up to pin 0 of the * first APIC and that the MADT should force that by ignoring the PC-AT * compatible flag and ignoring overrides that redirect IRQ 0 to pin 2. */ extern int acpi_quirks; #define ACPI_Q_OK 0 #define ACPI_Q_BROKEN (1 << 0) #define ACPI_Q_TIMER (1 << 1) #define ACPI_Q_MADT_IRQ0 (1 << 2) /* * Note that the low ivar values are reserved to provide * interface compatibility with ISA drivers which can also * attach to ACPI. */ #define ACPI_IVAR_HANDLE 0x100 #define ACPI_IVAR_UNUSED 0x101 /* Unused/reserved. */ #define ACPI_IVAR_PRIVATE 0x102 #define ACPI_IVAR_FLAGS 0x103 /* * Accessor functions for our ivars. Default value for BUS_READ_IVAR is * (type) 0. The accessor functions don't check return values. */ #define __ACPI_BUS_ACCESSOR(varp, var, ivarp, ivar, type) \ \ static __inline type varp ## _get_ ## var(device_t dev) \ { \ uintptr_t v = 0; \ BUS_READ_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, &v); \ return ((type) v); \ } \ \ static __inline void varp ## _set_ ## var(device_t dev, type t) \ { \ uintptr_t v = (uintptr_t) t; \ BUS_WRITE_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, v); \ } __ACPI_BUS_ACCESSOR(acpi, handle, ACPI, HANDLE, ACPI_HANDLE) __ACPI_BUS_ACCESSOR(acpi, private, ACPI, PRIVATE, void *) __ACPI_BUS_ACCESSOR(acpi, flags, ACPI, FLAGS, int) void acpi_fake_objhandler(ACPI_HANDLE h, void *data); static __inline device_t acpi_get_device(ACPI_HANDLE handle) { void *dev = NULL; AcpiGetData(handle, acpi_fake_objhandler, &dev); return ((device_t)dev); } static __inline ACPI_OBJECT_TYPE acpi_get_type(device_t dev) { ACPI_HANDLE h; ACPI_OBJECT_TYPE t; if ((h = acpi_get_handle(dev)) == NULL) return (ACPI_TYPE_NOT_FOUND); if (ACPI_FAILURE(AcpiGetType(h, &t))) return (ACPI_TYPE_NOT_FOUND); return (t); } /* Find the difference between two PM tick counts. */ static __inline uint32_t acpi_TimerDelta(uint32_t end, uint32_t start) { if (end < start && (AcpiGbl_FADT.Flags & ACPI_FADT_32BIT_TIMER) == 0) end |= 0x01000000; return (end - start); } #ifdef ACPI_DEBUGGER void acpi_EnterDebugger(void); #endif #ifdef ACPI_DEBUG #include #define STEP(x) do {printf x, printf("\n"); cngetc();} while (0) #else #define STEP(x) #endif #define ACPI_VPRINT(dev, acpi_sc, x...) do { \ if (acpi_get_verbose(acpi_sc)) \ device_printf(dev, x); \ } while (0) /* Values for the device _STA (status) method. */ #define ACPI_STA_PRESENT (1 << 0) #define ACPI_STA_ENABLED (1 << 1) #define ACPI_STA_SHOW_IN_UI (1 << 2) #define ACPI_STA_FUNCTIONAL (1 << 3) #define ACPI_STA_BATT_PRESENT (1 << 4) #define ACPI_DEVINFO_PRESENT(x, flags) \ (((x) & (flags)) == (flags)) #define ACPI_DEVICE_PRESENT(x) \ ACPI_DEVINFO_PRESENT(x, ACPI_STA_PRESENT | ACPI_STA_FUNCTIONAL) #define ACPI_BATTERY_PRESENT(x) \ ACPI_DEVINFO_PRESENT(x, ACPI_STA_PRESENT | ACPI_STA_FUNCTIONAL | \ ACPI_STA_BATT_PRESENT) /* Callback function type for walking subtables within a table. */ typedef void acpi_subtable_handler(ACPI_SUBTABLE_HEADER *, void *); BOOLEAN acpi_DeviceIsPresent(device_t dev); BOOLEAN acpi_BatteryIsPresent(device_t dev); ACPI_STATUS acpi_GetHandleInScope(ACPI_HANDLE parent, char *path, ACPI_HANDLE *result); ACPI_BUFFER *acpi_AllocBuffer(int size); ACPI_STATUS acpi_ConvertBufferToInteger(ACPI_BUFFER *bufp, UINT32 *number); ACPI_STATUS acpi_GetInteger(ACPI_HANDLE handle, char *path, UINT32 *number); ACPI_STATUS acpi_SetInteger(ACPI_HANDLE handle, char *path, UINT32 number); ACPI_STATUS acpi_ForeachPackageObject(ACPI_OBJECT *obj, void (*func)(ACPI_OBJECT *comp, void *arg), void *arg); ACPI_STATUS acpi_FindIndexedResource(ACPI_BUFFER *buf, int index, ACPI_RESOURCE **resp); ACPI_STATUS acpi_AppendBufferResource(ACPI_BUFFER *buf, ACPI_RESOURCE *res); ACPI_STATUS acpi_OverrideInterruptLevel(UINT32 InterruptNumber); ACPI_STATUS acpi_SetIntrModel(int model); int acpi_ReqSleepState(struct acpi_softc *sc, int state); int acpi_AckSleepState(struct apm_clone_data *clone, int error); ACPI_STATUS acpi_SetSleepState(struct acpi_softc *sc, int state); int acpi_wake_set_enable(device_t dev, int enable); int acpi_parse_prw(ACPI_HANDLE h, struct acpi_prw_data *prw); ACPI_STATUS acpi_Startup(void); void acpi_UserNotify(const char *subsystem, ACPI_HANDLE h, uint8_t notify); int acpi_bus_alloc_gas(device_t dev, int *type, int *rid, ACPI_GENERIC_ADDRESS *gas, struct resource **res, u_int flags); void acpi_walk_subtables(void *first, void *end, acpi_subtable_handler *handler, void *arg); BOOLEAN acpi_MatchHid(ACPI_HANDLE h, const char *hid); struct acpi_parse_resource_set { void (*set_init)(device_t dev, void *arg, void **context); void (*set_done)(device_t dev, void *context); void (*set_ioport)(device_t dev, void *context, uint64_t base, uint64_t length); void (*set_iorange)(device_t dev, void *context, uint64_t low, uint64_t high, uint64_t length, uint64_t align); void (*set_memory)(device_t dev, void *context, uint64_t base, uint64_t length); void (*set_memoryrange)(device_t dev, void *context, uint64_t low, uint64_t high, uint64_t length, uint64_t align); void (*set_irq)(device_t dev, void *context, uint8_t *irq, int count, int trig, int pol); void (*set_ext_irq)(device_t dev, void *context, uint32_t *irq, int count, int trig, int pol); void (*set_drq)(device_t dev, void *context, uint8_t *drq, int count); void (*set_start_dependent)(device_t dev, void *context, int preference); void (*set_end_dependent)(device_t dev, void *context); }; extern struct acpi_parse_resource_set acpi_res_parse_set; int acpi_identify(void); void acpi_config_intr(device_t dev, ACPI_RESOURCE *res); ACPI_STATUS acpi_lookup_irq_resource(device_t dev, int rid, struct resource *res, ACPI_RESOURCE *acpi_res); ACPI_STATUS acpi_parse_resources(device_t dev, ACPI_HANDLE handle, struct acpi_parse_resource_set *set, void *arg); struct resource *acpi_alloc_sysres(device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); /* ACPI event handling */ UINT32 acpi_event_power_button_sleep(void *context); UINT32 acpi_event_power_button_wake(void *context); UINT32 acpi_event_sleep_button_sleep(void *context); UINT32 acpi_event_sleep_button_wake(void *context); #define ACPI_EVENT_PRI_FIRST 0 #define ACPI_EVENT_PRI_DEFAULT 10000 #define ACPI_EVENT_PRI_LAST 20000 typedef void (*acpi_event_handler_t)(void *, int); EVENTHANDLER_DECLARE(acpi_sleep_event, acpi_event_handler_t); EVENTHANDLER_DECLARE(acpi_wakeup_event, acpi_event_handler_t); /* Device power control. */ ACPI_STATUS acpi_pwr_wake_enable(ACPI_HANDLE consumer, int enable); ACPI_STATUS acpi_pwr_switch_consumer(ACPI_HANDLE consumer, int state); int acpi_device_pwr_for_sleep(device_t bus, device_t dev, int *dstate); /* APM emulation */ void acpi_apm_init(struct acpi_softc *); /* Misc. */ static __inline struct acpi_softc * acpi_device_get_parent_softc(device_t child) { device_t parent; parent = device_get_parent(child); if (parent == NULL) return (NULL); return (device_get_softc(parent)); } static __inline int acpi_get_verbose(struct acpi_softc *sc) { if (sc) return (sc->acpi_verbose); return (0); } char *acpi_name(ACPI_HANDLE handle); int acpi_avoid(ACPI_HANDLE handle); int acpi_disabled(char *subsys); int acpi_machdep_init(device_t dev); void acpi_install_wakeup_handler(struct acpi_softc *sc); int acpi_sleep_machdep(struct acpi_softc *sc, int state); int acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result, int intr_enabled); int acpi_table_quirks(int *quirks); int acpi_machdep_quirks(int *quirks); uint32_t hpet_get_uid(device_t dev); /* Battery Abstraction. */ struct acpi_battinfo; int acpi_battery_register(device_t dev); int acpi_battery_remove(device_t dev); int acpi_battery_get_units(void); int acpi_battery_get_info_expire(void); int acpi_battery_bst_valid(struct acpi_bst *bst); int acpi_battery_bif_valid(struct acpi_bif *bif); int acpi_battery_get_battinfo(device_t dev, struct acpi_battinfo *info); /* Embedded controller. */ void acpi_ec_ecdt_probe(device_t); /* AC adapter interface. */ int acpi_acad_get_acline(int *); /* Package manipulation convenience functions. */ #define ACPI_PKG_VALID(pkg, size) \ ((pkg) != NULL && (pkg)->Type == ACPI_TYPE_PACKAGE && \ (pkg)->Package.Count >= (size)) int acpi_PkgInt(ACPI_OBJECT *res, int idx, UINT64 *dst); int acpi_PkgInt32(ACPI_OBJECT *res, int idx, uint32_t *dst); int acpi_PkgStr(ACPI_OBJECT *res, int idx, void *dst, size_t size); int acpi_PkgGas(device_t dev, ACPI_OBJECT *res, int idx, int *type, int *rid, struct resource **dst, u_int flags); int acpi_PkgFFH_IntelCpu(ACPI_OBJECT *res, int idx, int *vendor, int *class, uint64_t *address, int *accsize); ACPI_HANDLE acpi_GetReference(ACPI_HANDLE scope, ACPI_OBJECT *obj); /* * Base level for BUS_ADD_CHILD. Special devices are added at orders less * than this, and normal devices at or above this level. This keeps the * probe order sorted so that things like sysresource are available before * their children need them. */ #define ACPI_DEV_BASE_ORDER 100 /* Default maximum number of tasks to enqueue. */ #ifndef ACPI_MAX_TASKS #define ACPI_MAX_TASKS MAX(32, MAXCPU * 4) #endif /* Default number of task queue threads to start. */ #ifndef ACPI_MAX_THREADS #define ACPI_MAX_THREADS 3 #endif /* Use the device logging level for ktr(4). */ #define KTR_ACPI KTR_DEV SYSCTL_DECL(_debug_acpi); /* * Map a PXM to a VM domain. * * Returns the VM domain ID if found, or -1 if not found / invalid. */ -#if MAXMEMDOM > 1 extern int acpi_map_pxm_to_vm_domainid(int pxm); -#endif extern int acpi_get_domain(device_t dev, device_t child, int *domain); extern int acpi_parse_pxm(device_t dev, int *domain); #endif /* _KERNEL */ #endif /* !_ACPIVAR_H_ */ Index: head/sys/kern/kern_cpuset.c =================================================================== --- head/sys/kern/kern_cpuset.c (revision 297747) +++ head/sys/kern/kern_cpuset.c (revision 297748) @@ -1,1284 +1,1290 @@ /*- * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. * * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). */ static uma_zone_t cpuset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { for (; set->cs_parent != NULL; set = set->cs_parent) if (set->cs_flags & CPU_SET_ROOT) break; cpuset_ref(set); return (set); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; cpuset_ref(set); return (set); } /* * Release a reference in a context where it is safe to allocate. */ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. */ static void cpuset_rel_complete(struct cpuset *set) { LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Create a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, cpusetid_t id) { if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); CPU_AND(&set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. */ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, parent, mask, id); if (error == 0) return (0); free_unr(cpuset_unr, id); uma_zfree(cpuset_zone, set); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (check_mask) { if (!CPU_OVERLAP(&set->cs_mask, mask)) return (EDEADLK); CPU_COPY(&set->cs_mask, &newmask); CPU_AND(&newmask, mask); } else CPU_COPY(mask, &newmask); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); /* * Verify that we have access to this set of * cpus. */ root = set->cs_parent; if (root && !CPU_SUBSET(&root->cs_mask, mask)) return (EINVAL); mtx_lock_spin(&cpuset_lock); error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. * * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. */ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'fset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask) { struct cpuset *parent; if (set->cs_id == CPUSET_INVALID) parent = set->cs_parent; else parent = set; if (!CPU_SUBSET(&parent->cs_mask, mask)) return (EDEADLK); return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); } /* * Handle two cases for replacing the base set or mask of an entire process. * * 1) Set is non-null and mask is null. This reparents all anonymous sets * to the provided set and replaces all non-anonymous td_cpusets with the * provided set. * 2) Mask is non-null and set is null. This replaces or creates anonymous * sets for every thread with the existing base as a parent. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) { struct setlist freelist; struct setlist droplist; struct cpuset *tdset; struct cpuset *nset; struct thread *td; struct proc *p; int threads; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. */ LIST_INIT(&freelist); LIST_INIT(&droplist); nfree = 0; for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; if (nfree >= p->p_numthreads) break; threads = p->p_numthreads; PROC_UNLOCK(p); for (; nfree < threads; nfree++) { nset = uma_zalloc(cpuset_zone, M_WAITOK); LIST_INSERT_HEAD(&freelist, nset, cs_link); } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); tdset = td->td_cpuset; /* * Verify that a new mask doesn't specify cpus outside of * the set the thread is a member of. */ if (mask) { if (tdset->cs_id == CPUSET_INVALID) tdset = tdset->cs_parent; if (!CPU_SUBSET(&tdset->cs_mask, mask)) error = EDEADLK; /* * Verify that a new set won't leave an existing thread * mask without a cpu to run on. It can, however, restrict * the set. */ } else if (tdset->cs_id == CPUSET_INVALID) { if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask)) error = EDEADLK; } thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); /* * If we presently have an anonymous set or are applying a * mask we must create an anonymous shadow set. That is * either parented to our existing base or the supplied set. * * If we have a base set with no anonymous shadow we simply * replace it outright. */ tdset = td->td_cpuset; if (tdset->cs_id == CPUSET_INVALID || mask) { nset = LIST_FIRST(&freelist); LIST_REMOVE(nset, cs_link); if (mask) error = cpuset_shadow(tdset, nset, mask); else error = _cpuset_create(nset, set, &tdset->cs_mask, CPUSET_INVALID); if (error) { LIST_INSERT_HEAD(&freelist, nset, cs_link); thread_unlock(td); break; } } else nset = cpuset_ref(set); cpuset_rel_defer(&droplist, tdset); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); while ((nset = LIST_FIRST(&freelist)) != NULL) { LIST_REMOVE(nset, cs_link); uma_zfree(cpuset_zone, nset); } return (error); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { char *tbuf; size_t i, bytesp, bufsiz; tbuf = buf; bytesp = 0; bufsiz = CPUSETBUFSIZ; for (i = 0; i < (_NCPUWORDS - 1); i++) { bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]); bufsiz -= bytesp; tbuf += bytesp; } snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { u_int nwords; int i, ret; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); /* Allow to pass a shorter version of the mask when necessary. */ nwords = 1; for (i = 0; buf[i] != '\0'; i++) if (buf[i] == ',') nwords++; if (nwords > _NCPUWORDS) return (-1); CPU_ZERO(set); for (i = 0; i < (nwords - 1); i++) { ret = sscanf(buf, "%lx,", &set->__bits[i]); if (ret == 0 || ret == -1) return (-1); buf = strstr(buf, ","); if (buf == NULL) return (-1); buf++; } ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]); if (ret == 0 || ret == -1) return (-1); return (0); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, nset, mask); if (error == 0) { set = td->td_cpuset; td->td_cpuset = nset; sched_affinity(td); nset = NULL; } thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: if (nset) uma_zfree(cpuset_zone, nset); return (error); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; struct proc *p; cpusetid_t cs_id; cpuset_t mask; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); rset = uma_zalloc(cpuset_zone, M_WAITOK); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set); if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID)) goto out; /* cpuset_which() returns with PROC_LOCK held. */ old_set = td->td_cpuset; if (cpu == NOCPU) { /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ error = _cpuset_create(nset, cpuset_default, &mask, CPUSET_INVALID); goto applyset; } if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID && old_set->cs_parent->cs_id == 1)) { /* * Current set is either default (1) or * shadowed version of default set. * * Allocate new root set to be able to shadow it * with any mask. */ error = _cpuset_create(rset, cpuset_zero, &cpuset_zero->cs_mask, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; } rset->cs_flags |= CPU_SET_ROOT; parent = rset; rset = NULL; cs_id = CPUSET_INVALID; } else { /* Assume existing set was already allocated by previous call */ parent = old_set; old_set = NULL; } error = cpuset_shadow(parent, nset, &mask); applyset: if (error == 0) { thread_lock(td); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); nset = NULL; } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: if (nset != NULL) uma_zfree(cpuset_zone, nset); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) free_unr(cpuset_unr, cs_id); return (error); } /* * Creates system-wide cpusets and the cpuset for thread0 including two * sets: * * 0 - The root set which should represent all valid processors in the * system. It is initially created with a mask of all processors * because we don't know what processors are valid until cpuset_init() * runs. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. */ struct cpuset * cpuset_thread0(void) { struct cpuset *set; - int error; + int error, i; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); /* * Create the root system set for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_FILL(&set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; set->cs_flags = CPU_SET_ROOT; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); - /* MD Code is responsible for initializing sets if vm_ndomains > 1. */ - if (vm_ndomains == 1) - CPU_COPY(&all_cpus, &cpuset_domain[0]); + /* + * If MD code has not initialized per-domain cpusets, place all + * CPUs in domain 0. + */ + for (i = 0; i < MAXMEMDOM; i++) + if (!CPU_EMPTY(&cpuset_domain[i])) + goto domains_set; + CPU_COPY(&all_cpus, &cpuset_domain[0]); +domains_set: return (set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. */ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set = *setp; set->cs_flags |= CPU_SET_ROOT; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL); if (error) return (error); cpuset_rel(set); return (0); } /* * This is called once the final set of system cpus is known. Modifies * the root set and all children and mark the root read-only. */ static void cpuset_init(void *arg) { cpuset_t mask; mask = all_cpus; if (cpuset_modify(cpuset_zero, &mask)) panic("Can't set initial cpuset mask.\n"); cpuset_zero->cs_flags |= CPU_SET_RDONLY; } SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { struct cpuset *set; int error; /* * Presently we only support per-process sets. */ if (uap->which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(uap->setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(uap->id, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t id; int error; if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error) return (error); switch (uap->which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (uap->level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } id = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&id, uap->setid, sizeof(id)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; cpuset_t *mask; int error; size_t size; if (uap->cpusetsize < sizeof(cpuset_t) || uap->cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); size = uap->cpusetsize; mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error) goto out; switch (uap->level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (uap->which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (uap->level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (uap->which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: error = intr_getaffinity(uap->id, mask); break; case CPU_WHICH_DOMAIN: - if (uap->id < 0 || uap->id >= vm_ndomains) + if (uap->id < 0 || uap->id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[uap->id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) error = copyout(mask, uap->mask, size); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpuset_t *mask; int error; if (uap->cpusetsize < sizeof(cpuset_t) || uap->cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(uap->mask, mask, uap->cpusetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (uap->cpusetsize > sizeof(cpuset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += uap->cpusetsize; cp += sizeof(cpuset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } switch (uap->level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error) break; switch (uap->which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (uap->level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (uap->which) { case CPU_WHICH_TID: error = cpuset_setthread(uap->id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(uap->id, NULL, mask); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(uap->which, uap->id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: error = intr_setaffinity(uap->id, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB void ddb_display_cpuset(const cpuset_t *set) { int cpu, once; for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { if (CPU_ISSET(cpu, set)) { if (once == 0) { db_printf("%d", cpu); once = 1; } else db_printf(",%d", cpu); } } if (once == 0) db_printf(""); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); if (db_pager_quit) break; } } #endif /* DDB */ Index: head/sys/vm/vm_domain.c =================================================================== --- head/sys/vm/vm_domain.c (revision 297747) +++ head/sys/vm/vm_domain.c (revision 297748) @@ -1,384 +1,405 @@ /*- * Copyright (c) 2015 Adrian Chadd . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_ddb.h" #include #include #include #include #include #include -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static __inline int vm_domain_rr_selectdomain(int skip_domain) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC struct thread *td; td = curthread; td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; /* * If skip_domain is provided then skip over that * domain. This is intended for round robin variants * which first try a fixed domain. */ if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) { td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; } return (td->td_dom_rr_idx); #else return (0); #endif } /* * This implements a very simple set of VM domain memory allocation * policies and iterators. */ /* * A VM domain policy represents a desired VM domain policy. * Iterators implement searching through VM domains in a specific * order. */ /* * When setting a policy, the caller must establish their own * exclusive write protection for the contents of the domain * policy. */ int vm_domain_policy_init(struct vm_domain_policy *vp) { bzero(vp, sizeof(*vp)); vp->p.policy = VM_POLICY_NONE; vp->p.domain = -1; return (0); } int vm_domain_policy_set(struct vm_domain_policy *vp, vm_domain_policy_type_t vt, int domain) { seq_write_begin(&vp->seq); vp->p.policy = vt; vp->p.domain = domain; seq_write_end(&vp->seq); return (0); } /* * Take a local copy of a policy. * * The destination policy isn't write-barriered; this is used * for doing local copies into something that isn't shared. */ void vm_domain_policy_localcopy(struct vm_domain_policy *dst, const struct vm_domain_policy *src) { seq_t seq; for (;;) { seq = seq_read(&src->seq); *dst = *src; if (seq_consistent(&src->seq, seq)) return; cpu_spinwait(); } } /* * Take a write-barrier copy of a policy. * * The destination policy is write -barriered; this is used * for doing copies into policies that may be read by other * threads. */ void vm_domain_policy_copy(struct vm_domain_policy *dst, const struct vm_domain_policy *src) { seq_t seq; struct vm_domain_policy d; for (;;) { seq = seq_read(&src->seq); d = *src; if (seq_consistent(&src->seq, seq)) { seq_write_begin(&dst->seq); dst->p.domain = d.p.domain; dst->p.policy = d.p.policy; seq_write_end(&dst->seq); return; } cpu_spinwait(); } } int vm_domain_policy_validate(const struct vm_domain_policy *vp) { switch (vp->p.policy) { case VM_POLICY_NONE: case VM_POLICY_ROUND_ROBIN: case VM_POLICY_FIRST_TOUCH: case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: if (vp->p.domain == -1) return (0); return (-1); case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: +#ifdef VM_NUMA_ALLOC if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains) return (0); +#else + if (vp->p.domain == 0) + return (0); +#endif return (-1); default: return (-1); } return (-1); } int vm_domain_policy_cleanup(struct vm_domain_policy *vp) { /* For now, empty */ return (0); } int vm_domain_iterator_init(struct vm_domain_iterator *vi) { /* Nothing to do for now */ return (0); } /* * Manually setup an iterator with the given details. */ int vm_domain_iterator_set(struct vm_domain_iterator *vi, vm_domain_policy_type_t vt, int domain) { +#ifdef VM_NUMA_ALLOC switch (vt) { case VM_POLICY_FIXED_DOMAIN: vi->policy = VM_POLICY_FIXED_DOMAIN; vi->domain = domain; vi->n = 1; break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN; vi->domain = domain; vi->n = vm_ndomains; break; case VM_POLICY_FIRST_TOUCH: vi->policy = VM_POLICY_FIRST_TOUCH; vi->domain = PCPU_GET(domain); vi->n = 1; break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN; vi->domain = PCPU_GET(domain); vi->n = vm_ndomains; break; case VM_POLICY_ROUND_ROBIN: default: vi->policy = VM_POLICY_ROUND_ROBIN; vi->domain = -1; vi->n = vm_ndomains; break; } +#else + vi->domain = 0; + vi->n = 1; +#endif return (0); } /* * Setup an iterator based on the given policy. */ static inline void _vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, const struct vm_domain_policy *vt) { + +#ifdef VM_NUMA_ALLOC /* * Initialise the iterator. * * For first-touch, the initial domain is set * via the current thread CPU domain. * * For fixed-domain, it's assumed that the * caller has initialised the specific domain * it is after. */ switch (vt->p.policy) { case VM_POLICY_FIXED_DOMAIN: vi->policy = vt->p.policy; vi->domain = vt->p.domain; vi->n = 1; break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: vi->policy = vt->p.policy; vi->domain = vt->p.domain; vi->n = vm_ndomains; break; case VM_POLICY_FIRST_TOUCH: vi->policy = vt->p.policy; vi->domain = PCPU_GET(domain); vi->n = 1; break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: vi->policy = vt->p.policy; vi->domain = PCPU_GET(domain); vi->n = vm_ndomains; break; case VM_POLICY_ROUND_ROBIN: default: /* * Default to round-robin policy. */ vi->policy = VM_POLICY_ROUND_ROBIN; vi->domain = -1; vi->n = vm_ndomains; break; } +#else + vi->domain = 0; + vi->n = 1; +#endif } void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, const struct vm_domain_policy *vt) { seq_t seq; struct vm_domain_policy vt_lcl; for (;;) { seq = seq_read(&vt->seq); vt_lcl = *vt; if (seq_consistent(&vt->seq, seq)) { _vm_domain_iterator_set_policy(vi, &vt_lcl); return; } cpu_spinwait(); } } /* * Return the next VM domain to use. * * Returns 0 w/ domain set to the next domain to use, or * -1 to indicate no more domains are available. */ int vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain) { /* General catch-all */ if (vi->n <= 0) return (-1); +#ifdef VM_NUMA_ALLOC switch (vi->policy) { case VM_POLICY_FIXED_DOMAIN: case VM_POLICY_FIRST_TOUCH: *domain = vi->domain; vi->n--; break; case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: /* * XXX TODO: skip over the rr'ed domain * if it equals the one we started with. */ if (vi->n == vm_ndomains) *domain = vi->domain; else *domain = vm_domain_rr_selectdomain(vi->domain); vi->n--; break; case VM_POLICY_ROUND_ROBIN: default: *domain = vm_domain_rr_selectdomain(-1); vi->n--; break; } +#else + *domain = 0; + vi->n--; +#endif return (0); } /* * Returns 1 if the iteration is done, or 0 if it has not. * This can only be called after at least one loop through * the iterator. Ie, it's designed to be used as a tail * check of a loop, not the head check of a loop. */ int vm_domain_iterator_isdone(struct vm_domain_iterator *vi) { return (vi->n <= 0); } int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi) { return (0); } Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 297747) +++ head/sys/vm/vm_pageout.c (revision 297748) @@ -1,1850 +1,1850 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif static int vm_max_launder = 32; static int vm_pageout_update_period; static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; static int vm_swap_idle_enabled = 0; #else static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; #endif static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, "Low memory callback period"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it wirte busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queues * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* * The page's object might have changed, and/or the page might * have moved from its original position in the queue. If the * page's object has changed, then the caller should abandon * processing the page because the wrong object lock was * acquired. Use the marker's plinks.q, not the page's, to * determine if the page has been moved. The state of the * page's plinks.q can be indeterminate; whereas, the marker's * plinks.q must be valid. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m->object == object && m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2*vm_pageout_page_count], pb, ps; int pageout_count; int ib, is, page_base; vm_pindex_t pindex = m->pindex; vm_page_lock_assert(m, MA_OWNED); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ /* * Can't clean the page if it's busy or held. */ vm_page_assert_unbusied(m); KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m)); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying * align the clusters (which leave sporatic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib && pageout_count < vm_pageout_page_count) { vm_page_t p; if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } vm_page_lock(p); if (p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * alignment boundry, stop here and switch directions. Do * not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { vm_page_t p; if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; vm_page_lock(p); if (p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past a page boundry. This catches boundry * conditions. */ if (ib && pageout_count < vm_pageout_page_count) goto more; /* * we allow reads during pageouts... */ return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ vm_page_lock(mt); vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (vm_page_busied(p)) continue; PCPU_INC(cnt.v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } if (p->queue != PQ_ACTIVE && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; } else if (p->queue == PQ_ACTIVE) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } } else if (p->queue == PQ_INACTIVE) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (m->queue != PQ_INACTIVE || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or held while the object * and page locks were released. */ if (vm_page_busied(m) || m->hold_count != 0) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if (vm_pageout_cluster(m) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass 0 - Update active LRU/deactivate pages * pass 1 - Move inactive to cache or free * pass 2 - Launder dirty pages */ static void vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; int act_delta, addl_page_shortage, deficit, error, maxlaunder, maxscan; int page_shortage, scan_tick, scanned, starting_page_shortage; int vnodes_skipped; boolean_t pageout_ok, queues_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages we want to either free or move * to the cache. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. */ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass > 1) maxlaunder = 10000; vnodes_skipped = 0; /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queues_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queues_locked, ("unlocked queues")); KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of the page_shortage for the * loop over the active queue below. */ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(m->hold_count == 0, ("Held page %p", m)); /* * We unlock the inactive page queue, invalidating the * 'next' pointer. Use our marker to remember our * place. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_pagequeue_unlock(pq); queues_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) goto requeue_page; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of placing it onto the cache queue. If, * however, any of the page's mappings allow write access, * then the page may still be modified until the last of those * mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } if (m->dirty == 0) { /* * Clean pages can be freed. */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; } else if ((object->flags & OBJ_DEAD) != 0) { /* * Leave dirty pages from dead objects at the front of * the queue. They are being paged out and freed by * the thread that destroyed the object. They will * leave the queue shortly after the scan finishes, so * they should be discounted from the inactive count. */ addl_page_shortage++; } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive versus freeing * a clean page. Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ m->flags |= PG_WINATCFLS; requeue_page: vm_pagequeue_lock(pq); queues_locked = TRUE; vm_page_requeue_locked(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. */ if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = TRUE; else if (disable_swap_pageouts) pageout_ok = FALSE; else if (defer_swap_pageouts) pageout_ok = vm_page_count_min(); else pageout_ok = TRUE; if (!pageout_ok) goto requeue_page; error = vm_pageout_clean(m); /* * Decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ if (error == 0) { page_shortage--; maxlaunder--; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } else if (error == EBUSY) { addl_page_shortage++; } vm_page_lock_assert(m, MA_NOTOWNED); goto relock_queues; } drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queues: if (!queues_locked) { vm_pagequeue_lock(pq); queues_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't cache or free the targeted * number of pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't cache or free enough pages. */ if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - vm_cnt.v_free_min) (void)speedup_syncer(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ page_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + vm_paging_target() + deficit + addl_page_shortage; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (page_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (page_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for pagedaemon pages is done after checking the * page for eligibility... */ PCPU_INC(cnt.v_pdpages); /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Unlocked object ref count check. Two races are possible. * 1) The ref was transitioning to zero and we saw non-zero, * the pmap bits will be checked unnecessarily. * 2) The ref was transitioning to one and we saw zero. * The page lock prevents a new reference to this page so * we need not check the reference bits. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else m->act_count -= min(m->act_count, ACT_DECLINE); /* * Move this page to the tail of the active or inactive * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); vm_page_deactivate(m); page_shortage--; } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } #endif } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of it's child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD(p); if (!vm_map_trylock_read(&vm->vm_map)) { _PRELE(p); PROC_UNLOCK(p); vmspace_free(vm); continue; } PROC_UNLOCK(p); size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); wakeup(&vm_cnt.v_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *domain; int domidx; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(domain->vmd_segs != 0, ("domain without segments")); domain->vmd_last_active_scan = ticks; vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, &domain->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { /* * If we have enough free memory, wakeup waiters. Do * not clear vm_pages_needed until we reach our target, * otherwise we may be woken up over and over again and * waste a lot of cpu. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_pages_needed && !vm_page_count_min()) { if (!vm_paging_needed()) vm_pages_needed = 0; wakeup(&vm_cnt.v_free_count); } if (vm_pages_needed) { /* * We're still not done. Either vm_pages_needed was * set by another thread during the previous scan * (typically, this happens during a level 0 scan) or * vm_pages_needed was already set and the scan failed * to free enough pages. If we haven't yet performed * a level >= 2 scan (unlimited dirty cleaning), then * upgrade the level and scan again now. Otherwise, * sleep a bit and try again later. While sleeping, * vm_pages_needed can be cleared. */ if (domain->vmd_pass > 1) msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", hz / 2); } else { /* * Good enough, sleep until required to refresh * stats. */ msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", hz); } if (vm_pages_needed) { vm_cnt.v_pdwakeups++; domain->vmd_pass++; } else domain->vmd_pass = 0; mtx_unlock(&vm_page_queue_free_mtx); vm_pageout_scan(domain, domain->vmd_pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init(void) { /* * Initialize some paging parameters. */ vm_cnt.v_interrupt_free_min = 2; if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vm_cnt.v_page_count > 1024) vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; else vm_cnt.v_free_min = 4; vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + vm_cnt.v_interrupt_free_min; vm_cnt.v_free_reserved = vm_pageout_page_count + vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; vm_cnt.v_free_min += vm_cnt.v_free_reserved; vm_cnt.v_free_severe += vm_cnt.v_free_reserved; vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC int i; #endif swap_pager_swap_init(); -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } } #endif error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &vm_cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup(void) { if (!vm_pages_needed && curthread->td_proc != pageproc) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); PROC_UNLOCK(p); if (vm == NULL) continue; size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_pageout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/vm/vm_phys.c =================================================================== --- head/sys/vm/vm_phys.c (revision 297747) +++ head/sys/vm/vm_phys.c (revision 297748) @@ -1,1525 +1,1529 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Physical memory system implementation * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include -#if MAXMEMDOM > 1 #include -#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); +#ifdef VM_NUMA_ALLOC struct mem_affinity *mem_affinity; int *mem_locality; +#endif int vm_ndomains = 1; struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; int vm_phys_nsegs; struct vm_phys_fictitious_seg; static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, struct vm_phys_fictitious_seg *); RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = RB_INITIALIZER(_vm_phys_fictitious_tree); struct vm_phys_fictitious_seg { RB_ENTRY(vm_phys_fictitious_seg) node; /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; }; RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, vm_phys_fictitious_cmp); static struct rwlock vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; static int vm_nfreelists; /* * Provides the mapping from VM_FREELIST_* to free list indices (flind). */ static int vm_freelist_to_flind[VM_NFREELIST]; CTASSERT(VM_FREELIST_DEFAULT == 0); #ifdef VM_FREELIST_ISADMA #define VM_ISADMA_BOUNDARY 16777216 #endif #ifdef VM_FREELIST_DMA32 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) #endif /* * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about * the ordering of the free list boundaries. */ #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY) CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY); #endif #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); #endif static int cnt_prezero; SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, "The number of physical pages prezeroed at idle time"); static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info"); static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); #endif SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); /* * Default to first-touch + round-robin. */ static struct mtx vm_default_policy_mtx; MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", MTX_DEF); -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); #else /* Use round-robin so the domain policy code will only try once per allocation */ static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); #endif static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order); static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static int vm_phys_paddr_to_segind(vm_paddr_t pa); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); static int sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) { char policy_name[32]; int error; mtx_lock(&vm_default_policy_mtx); /* Map policy to output string */ switch (vm_default_policy.p.policy) { case VM_POLICY_FIRST_TOUCH: strcpy(policy_name, "first-touch"); break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: strcpy(policy_name, "first-touch-rr"); break; case VM_POLICY_ROUND_ROBIN: default: strcpy(policy_name, "rr"); break; } mtx_unlock(&vm_default_policy_mtx); error = sysctl_handle_string(oidp, &policy_name[0], sizeof(policy_name), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vm_default_policy_mtx); /* Set: match on the subset of policies that make sense as a default */ if (strcmp("first-touch-rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); } else if (strcmp("first-touch", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH, 0); } else if (strcmp("rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_ROUND_ROBIN, 0); } else { error = EINVAL; goto finish; } error = 0; finish: mtx_unlock(&vm_default_policy_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_vm_default_policy, "A", "Default policy (rr, first-touch, first-touch-rr"); /* * Red-black tree helpers for vm fictitious range management. */ static inline int vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, struct vm_phys_fictitious_seg *range) { KASSERT(range->start != 0 && range->end != 0, ("Invalid range passed on search for vm_fictitious page")); if (p->start >= range->end) return (1); if (p->start < range->start) return (-1); return (0); } static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, struct vm_phys_fictitious_seg *p2) { /* Check if this is a search for a page */ if (p1->end == 0) return (vm_phys_fictitious_in_range(p1, p2)); KASSERT(p2->end != 0, ("Invalid range passed as second parameter to vm fictitious comparison")); /* Searching to add a new range */ if (p1->end <= p2->start) return (-1); if (p1->start >= p2->end) return (1); panic("Trying to add overlapping vm fictitious ranges:\n" "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); } static __inline int vm_rr_selectdomain(void) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC struct thread *td; td = curthread; td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; return (td->td_dom_rr_idx); #else return (0); #endif } /* * Initialise a VM domain iterator. * * Check the thread policy, then the proc policy, * then default to the system policy. * * Later on the various layers will have this logic * plumbed into them and the phys code will be explicitly * handed a VM domain policy to use. */ static void vm_policy_iterator_init(struct vm_domain_iterator *vi) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC struct vm_domain_policy lcl; #endif vm_domain_iterator_init(vi); -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC /* Copy out the thread policy */ vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Thread policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } vm_domain_policy_localcopy(&lcl, &curthread->td_proc->p_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Process policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } #endif /* Use system default policy */ vm_domain_iterator_set_policy(vi, &vm_default_policy); } static void vm_policy_iterator_finish(struct vm_domain_iterator *vi) { vm_domain_iterator_cleanup(vi); } boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { struct vm_phys_seg *s; int idx; while ((idx = ffsl(mask)) != 0) { idx--; /* ffsl counts from 1 */ mask &= ~(1UL << idx); s = &vm_phys_segs[idx]; if (low < s->end && high > s->start) return (TRUE); } return (FALSE); } /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. */ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_freelist *fl; int dom, error, flind, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); for (dom = 0; dom < vm_ndomains; dom++) { sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { sbuf_printf(&sbuf, "\nFREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, " | POOL %d", pind); sbuf_printf(&sbuf, "\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, "-- -- "); sbuf_printf(&sbuf, "--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); } sbuf_printf(&sbuf, "\n"); } } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Outputs the set of physical memory segments. */ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_phys_seg *seg; int error, segind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); for (segind = 0; segind < vm_phys_nsegs; segind++) { sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); seg = &vm_phys_segs[segind]; sbuf_printf(&sbuf, "start: %#jx\n", (uintmax_t)seg->start); sbuf_printf(&sbuf, "end: %#jx\n", (uintmax_t)seg->end); sbuf_printf(&sbuf, "domain: %d\n", seg->domain); sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Return affinity, or -1 if there's no affinity information. */ int vm_phys_mem_affinity(int f, int t) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); #else return (-1); #endif } -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC /* * Outputs the VM locality table. */ static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; int error, i, j; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n"); for (i = 0; i < vm_ndomains; i++) { sbuf_printf(&sbuf, "%d: ", i); for (j = 0; j < vm_ndomains; j++) { sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); } sbuf_printf(&sbuf, "\n"); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } #endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { m->order = order; if (tail) TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q); else TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q); fl[order].lcnt++; } static void vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { TAILQ_REMOVE(&fl[order].pl, m, plinks.q); fl[order].lcnt--; m->order = VM_NFREEORDER; } /* * Create a physical memory segment. */ static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) { struct vm_phys_seg *seg; KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); KASSERT(domain < vm_ndomains, ("vm_phys_create_seg: invalid domain provided")); seg = &vm_phys_segs[vm_phys_nsegs++]; while (seg > vm_phys_segs && (seg - 1)->start >= end) { *seg = *(seg - 1); seg--; } seg->start = start; seg->end = end; seg->domain = domain; } static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { +#ifdef VM_NUMA_ALLOC int i; if (mem_affinity == NULL) { _vm_phys_create_seg(start, end, 0); return; } for (i = 0;; i++) { if (mem_affinity[i].end == 0) panic("Reached end of affinity info"); if (mem_affinity[i].end <= start) continue; if (mem_affinity[i].start > start) panic("No affinity info for start %jx", (uintmax_t)start); if (mem_affinity[i].end >= end) { _vm_phys_create_seg(start, end, mem_affinity[i].domain); break; } _vm_phys_create_seg(start, mem_affinity[i].end, mem_affinity[i].domain); start = mem_affinity[i].end; } +#else + _vm_phys_create_seg(start, end, 0); +#endif } /* * Add a physical memory segment. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t paddr; KASSERT((start & PAGE_MASK) == 0, ("vm_phys_define_seg: start is not page aligned")); KASSERT((end & PAGE_MASK) == 0, ("vm_phys_define_seg: end is not page aligned")); /* * Split the physical memory segment if it spans two or more free * list boundaries. */ paddr = start; #ifdef VM_FREELIST_ISADMA if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) { vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY); paddr = VM_ISADMA_BOUNDARY; } #endif #ifdef VM_FREELIST_LOWMEM if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); paddr = VM_LOWMEM_BOUNDARY; } #endif #ifdef VM_FREELIST_DMA32 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); paddr = VM_DMA32_BOUNDARY; } #endif vm_phys_create_seg(paddr, end); } /* * Initialize the physical memory allocator. * * Requires that vm_page_array is initialized! */ void vm_phys_init(void) { struct vm_freelist *fl; struct vm_phys_seg *seg; u_long npages; int dom, flind, freelist, oind, pind, segind; /* * Compute the number of free lists, and generate the mapping from the * manifest constants VM_FREELIST_* to the free list indices. * * Initially, the entries of vm_freelist_to_flind[] are set to either * 0 or 1 to indicate which free lists should be created. */ npages = 0; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1; else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; else #endif #ifdef VM_FREELIST_DMA32 if ( #ifdef VM_DMA32_NPAGES_THRESHOLD /* * Create the DMA32 free list only if the amount of * physical memory above physical address 4G exceeds the * given threshold. */ npages > VM_DMA32_NPAGES_THRESHOLD && #endif seg->end <= VM_DMA32_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; else #endif { npages += atop(seg->end - seg->start); vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; } } /* Change each entry into a running total of the free lists. */ for (freelist = 1; freelist < VM_NFREELIST; freelist++) { vm_freelist_to_flind[freelist] += vm_freelist_to_flind[freelist - 1]; } vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); /* Change each entry into a free list index. */ for (freelist = 0; freelist < VM_NFREELIST; freelist++) vm_freelist_to_flind[freelist]--; /* * Initialize the first_page and free_queues fields of each physical * memory segment. */ #ifdef VM_PHYSSEG_SPARSE npages = 0; #endif for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; #ifdef VM_PHYSSEG_SPARSE seg->first_page = &vm_page_array[npages]; npages += atop(seg->end - seg->start); #else seg->first_page = PHYS_TO_VM_PAGE(seg->start); #endif #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_ISADMA]; KASSERT(flind >= 0, ("vm_phys_init: ISADMA flind < 0")); } else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; KASSERT(flind >= 0, ("vm_phys_init: LOWMEM flind < 0")); } else #endif #ifdef VM_FREELIST_DMA32 if (seg->end <= VM_DMA32_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; KASSERT(flind >= 0, ("vm_phys_init: DMA32 flind < 0")); } else #endif { flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; KASSERT(flind >= 0, ("vm_phys_init: DEFAULT flind < 0")); } seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; } /* * Initialize the free queues. */ for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; for (oind = 0; oind < VM_NFREEORDER; oind++) TAILQ_INIT(&fl[oind].pl); } } } rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* * Split a contiguous, power of two-sized set of physical pages. */ static __inline void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order) { vm_page_t m_buddy; while (oind > order) { oind--; m_buddy = &m[1 << oind]; KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); vm_freelist_add(fl, m_buddy, oind, 0); } } /* * Initialize a physical page and add it to the free lists. */ void vm_phys_add_page(vm_paddr_t pa) { vm_page_t m; struct vm_domain *vmd; vm_cnt.v_page_count++; m = vm_phys_paddr_to_vm_page(pa); m->phys_addr = pa; m->queue = PQ_NONE; m->segind = vm_phys_paddr_to_segind(pa); vmd = vm_phys_domain(m); vmd->vmd_page_count++; vmd->vmd_segs |= 1UL << m->segind; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_add_page: page %p has unexpected order %d", m, m->order)); m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } /* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_pages(int pool, int order) { vm_page_t m; int domain, flind; struct vm_domain_iterator vi; KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { for (flind = 0; flind < vm_nfreelists; flind++) { m = vm_phys_alloc_domain_pages(domain, flind, pool, order); if (m != NULL) return (m); } } vm_policy_iterator_finish(&vi); return (NULL); } /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. The free list must be specified using one of the * manifest constants VM_FREELIST_*. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order) { vm_page_t m; struct vm_domain_iterator vi; int domain; KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", freelist)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { m = vm_phys_alloc_domain_pages(domain, vm_freelist_to_flind[freelist], pool, order); if (m != NULL) return (m); } vm_policy_iterator_finish(&vi); return (NULL); } static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order) { struct vm_freelist *fl; struct vm_freelist *alt; int oind, pind; vm_page_t m; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { vm_freelist_rem(fl, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } /* * The given pool was empty. Find the largest * contiguous, power-of-two-sized set of pages in any * pool. Transfer these pages to the given pool, and * use them to satisfy the allocation. */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } } return (NULL); } /* * Find the vm_page corresponding to the given physical address. */ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (&seg->first_page[atop(pa - seg->start)]); } return (NULL); } vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; m = NULL; tmp.start = pa; tmp.end = 0; rw_rlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); rw_runlock(&vm_phys_fictitious_reg_lock); if (seg == NULL) return (NULL); m = &seg->first_page[atop(pa - seg->start)]; KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); return (m); } static inline void vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, long page_count, vm_memattr_t memattr) { long i; for (i = 0; i < page_count; i++) { vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); range[i].oflags &= ~VPO_UNMANAGED; range[i].busy_lock = VPB_UNBUSIED; } } int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) { struct vm_phys_fictitious_seg *seg; vm_page_t fp; long page_count; #ifdef VM_PHYSSEG_DENSE long pi, pe; long dpage_count; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; if ((pe - first_page) > vm_page_array_size) { /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Use vm_page_array pages for those that are * inside of the vm_page_array range, and * allocate the remaining ones. */ dpage_count = vm_page_array_size - (pi - first_page); vm_phys_fictitious_init_range(fp, start, dpage_count, memattr); page_count -= dpage_count; start += ptoa(dpage_count); goto alloc; } /* * We can allocate the full range from vm_page_array, * so there's no need to register the range in the tree. */ vm_phys_fictitious_init_range(fp, start, page_count, memattr); return (0); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ fp = &vm_page_array[0]; dpage_count = pe - first_page; vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, memattr); end -= ptoa(dpage_count); page_count -= dpage_count; goto alloc; } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* * Trying to register a fictitious range that expands before * and after vm_page_array. */ return (EINVAL); } else { alloc: #endif fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, M_WAITOK | M_ZERO); #ifdef VM_PHYSSEG_DENSE } #endif vm_phys_fictitious_init_range(fp, start, page_count, memattr); seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); seg->start = start; seg->end = end; seg->first_page = fp; rw_wlock(&vm_phys_fictitious_reg_lock); RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE long pi, pe; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { if ((pe - first_page) <= vm_page_array_size) { /* * This segment was allocated using vm_page_array * only, there's nothing to do since those pages * were never added to the tree. */ return; } /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Calculate how many pages were added to the * tree and free them. */ start = ptoa(first_page + vm_page_array_size); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ end = ptoa(first_page); } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* Since it's not possible to register such a range, panic. */ panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } #endif tmp.start = start; tmp.end = 0; rw_wlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); if (seg->start != start || seg->end != end) { rw_wunlock(&vm_phys_fictitious_reg_lock); panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); free(seg->first_page, M_FICT_PAGES); free(seg, M_FICT_PAGES); } /* * Find the segment containing the given physical address. */ static int vm_phys_paddr_to_segind(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (segind); } panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" , (uintmax_t)pa); } /* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_pages(vm_page_t m, int order) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa; vm_page_t m_buddy; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); KASSERT(m->pool < VM_NFREEPOOL, ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); seg = &vm_phys_segs[m->segind]; if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; m_buddy = &seg->first_page[atop(pa - seg->start)]; if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; vm_freelist_add(fl, m, order, 1); } /* * Free a contiguous, arbitrarily sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_contig(vm_page_t m, u_long npages) { u_int n; int order; /* * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. */ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;; npages -= n) { /* * Unsigned "min" is used here so that "order" is assigned * "VM_NFREEORDER - 1" when "m"'s physical address is zero * or the low-order bits of its physical address are zero * because the size of a physical address exceeds the size of * a long. */ order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, VM_NFREEORDER - 1); n = 1 << order; if (npages < n) break; vm_phys_free_pages(m, order); m += n; } /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */ for (; npages > 0; npages -= n) { order = flsl(npages) - 1; n = 1 << order; vm_phys_free_pages(m, order); m += n; } } /* * Scan physical memory between the specified addresses "low" and "high" for a * run of contiguous physical pages that satisfy the specified conditions, and * return the lowest page in the run. The specified "alignment" determines * the alignment of the lowest physical page in the run. If the specified * "boundary" is non-zero, then the run of physical pages cannot span a * physical address that is a multiple of "boundary". * * "npages" must be greater than zero. Both "alignment" and "boundary" must * be a power of two. */ vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options) { vm_paddr_t pa_end; vm_page_t m_end, m_run, m_start; struct vm_phys_seg *seg; int segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); if (low >= high) return (NULL); for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (seg->start >= high) break; if (low >= seg->end) continue; if (low <= seg->start) m_start = seg->first_page; else m_start = &seg->first_page[atop(low - seg->start)]; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) continue; m_end = &seg->first_page[atop(pa_end - seg->start)]; m_run = vm_page_scan_contig(npages, m_start, m_end, alignment, boundary, options); if (m_run != NULL) return (m_run); } return (NULL); } /* * Set the pool for a contiguous, power of two-sized set of physical pages. */ void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) m_tmp->pool = pool; } /* * Search for the given physical page "m" in the free lists. If the search * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa, pa_half; vm_page_t m_set, m_tmp; int order; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) m_set = &seg->first_page[atop(pa - seg->start)]; else return (FALSE); } if (m_set->order < order) return (FALSE); if (m_set->order == VM_NFREEORDER) return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); /* * Next, remove "m_set" from the free lists. Finally, extract * "m" from "m_set" using an iterative algorithm: While "m_set" * is larger than a page, shrink "m_set" by returning the half * of "m_set" that does not contain "m" to the free lists. */ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) m_tmp = &seg->first_page[atop(pa_half - seg->start)]; else { m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); } /* * Try to zero one physical page. Used by an idle priority thread. */ boolean_t vm_phys_zero_pages_idle(void) { static struct vm_freelist *fl; static int flind, oind, pind; vm_page_t m, m_tmp; int domain; domain = vm_rr_selectdomain(); fl = vm_phys_free_queues[domain][0][0]; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) { for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { vm_phys_unfree_page(m_tmp); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); pmap_zero_page_idle(m_tmp); m_tmp->flags |= PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m_tmp, 0); vm_page_zero_count++; cnt_prezero++; return (TRUE); } } } oind++; if (oind == VM_NFREEORDER) { oind = 0; pind++; if (pind == VM_NFREEPOOL) { pind = 0; flind++; if (flind == vm_nfreelists) flind = 0; } fl = vm_phys_free_queues[domain][flind][pind]; } } } /* * Allocate a contiguous set of physical pages of the given size * "npages" from the free lists. All of the physical pages must be at * or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross * any physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa_end, pa_start; vm_page_t m_run; struct vm_domain_iterator vi; struct vm_phys_seg *seg; int domain, segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (low >= high) return (NULL); vm_policy_iterator_init(&vi); restartdom: if (vm_domain_iterator_run(&vi, &domain) != 0) { vm_policy_iterator_finish(&vi); return (NULL); } m_run = NULL; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; if (seg->start >= high || seg->domain != domain) continue; if (low >= seg->end) break; if (low <= seg->start) pa_start = seg->start; else pa_start = low; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - pa_start < ptoa(npages)) continue; m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, alignment, boundary); if (m_run != NULL) break; } if (m_run == NULL && !vm_domain_iterator_isdone(&vi)) goto restartdom; vm_policy_iterator_finish(&vi); return (m_run); } /* * Allocate a run of contiguous physical pages from the free list for the * specified segment. */ static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_freelist *fl; vm_paddr_t pa, pa_end, size; vm_page_t m, m_ret; u_long npages_end; int oind, order, pind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); /* Search for a run satisfying the specified conditions. */ size = npages << PAGE_SHIFT; for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*seg->free_queues)[pind]; TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { /* * Is the size of this allocation request * larger than the largest block size? */ if (order >= VM_NFREEORDER) { /* * Determine if a sufficient number of * subsequent blocks to satisfy the * allocation request are free. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; for (;;) { pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1); if (pa >= pa_end || pa < seg->start || pa >= seg->end) break; m = &seg->first_page[atop(pa - seg->start)]; if (m->order != VM_NFREEORDER - 1) break; } /* If not, go to the next block. */ if (pa < pa_end) continue; } /* * Determine if the blocks are within the * given range, satisfy the given alignment, * and do not cross the given boundary. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; if (pa >= low && pa_end <= high && (pa & (alignment - 1)) == 0 && ((pa ^ (pa_end - 1)) & ~(boundary - 1)) == 0) goto done; } } } return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { fl = (*seg->free_queues)[m->pool]; vm_freelist_rem(fl, m, m->order); } if (m_ret->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); fl = (*seg->free_queues)[m_ret->pool]; vm_phys_split_pages(m_ret, oind, fl, order); /* Return excess pages to the free lists. */ npages_end = roundup2(npages, 1 << imin(oind, order)); if (npages < npages_end) vm_phys_free_contig(&m_ret[npages], npages_end - npages); return (m_ret); } #ifdef DDB /* * Show the number of physical pages in each of the free lists. */ DB_SHOW_COMMAND(freepages, db_show_freepages) { struct vm_freelist *fl; int flind, oind, pind, dom; for (dom = 0; dom < vm_ndomains; dom++) { db_printf("DOMAIN: %d\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { db_printf("FREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf(" | POOL %d", pind); db_printf("\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf("-- -- "); db_printf("--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { db_printf(" %2.2d (%6.6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; db_printf(" | %6.6d", fl[oind].lcnt); } db_printf("\n"); } db_printf("\n"); } db_printf("\n"); } } #endif Index: head/sys/vm/vm_phys.h =================================================================== --- head/sys/vm/vm_phys.h (revision 297747) +++ head/sys/vm/vm_phys.h (revision 297748) @@ -1,126 +1,126 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Physical memory system definitions */ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ #ifdef _KERNEL /* Domains must be dense (non-sparse) and zero-based. */ struct mem_affinity { vm_paddr_t start; vm_paddr_t end; int domain; }; struct vm_freelist { struct pglist pl; int lcnt; }; struct vm_phys_seg { vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; int domain; struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER]; }; extern struct mem_affinity *mem_affinity; extern int *mem_locality; extern int vm_ndomains; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; /* * The following functions are only to be used by the virtual memory system. */ void vm_phys_add_page(vm_paddr_t pa); void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order); vm_page_t vm_phys_alloc_pages(int pool, int order); boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa); void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); int vm_phys_mem_affinity(int f, int t); /* * vm_phys_domain: * * Return the memory domain the page belongs to. */ static inline struct vm_domain * vm_phys_domain(vm_page_t m) { -#if MAXMEMDOM > 1 +#ifdef VM_NUMA_ALLOC int domn, segind; /* XXXKIB try to assert that the page is managed */ segind = m->segind; KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); domn = vm_phys_segs[segind].domain; KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); return (&vm_dom[domn]); #else return (&vm_dom[0]); #endif } static inline void vm_phys_freecnt_adj(vm_page_t m, int adj) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); vm_cnt.v_free_count += adj; vm_phys_domain(m)->vmd_free_count += adj; } #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: head/sys/x86/acpica/srat.c =================================================================== --- head/sys/x86/acpica/srat.c (revision 297747) +++ head/sys/x86/acpica/srat.c (revision 297748) @@ -1,506 +1,521 @@ /*- * Copyright (c) 2010 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_vm.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if MAXMEMDOM > 1 struct cpu_info { int enabled:1; int has_memory:1; int domain; } cpus[MAX_APIC_ID + 1]; struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; int num_mem; static ACPI_TABLE_SRAT *srat; static vm_paddr_t srat_physaddr; -static int vm_domains[VM_PHYSSEG_MAX]; +static int domain_pxm[MAXMEMDOM]; +static int ndomain; static ACPI_TABLE_SLIT *slit; static vm_paddr_t slit_physaddr; static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; static void srat_walk_table(acpi_subtable_handler *handler, void *arg); /* * SLIT parsing. */ static void slit_parse_table(ACPI_TABLE_SLIT *s) { int i, j; int i_domain, j_domain; int offset = 0; uint8_t e; /* * This maps the SLIT data into the VM-domain centric view. * There may be sparse entries in the PXM namespace, so * remap them to a VM-domain ID and if it doesn't exist, * skip it. * * It should result in a packed 2d array of VM-domain * locality information entries. */ if (bootverbose) printf("SLIT.Localities: %d\n", (int) s->LocalityCount); for (i = 0; i < s->LocalityCount; i++) { i_domain = acpi_map_pxm_to_vm_domainid(i); if (i_domain < 0) continue; if (bootverbose) printf("%d: ", i); for (j = 0; j < s->LocalityCount; j++) { j_domain = acpi_map_pxm_to_vm_domainid(j); if (j_domain < 0) continue; e = s->Entry[i * s->LocalityCount + j]; if (bootverbose) printf("%d ", (int) e); /* 255 == "no locality information" */ if (e == 255) vm_locality_table[offset] = -1; else vm_locality_table[offset] = e; offset++; } if (bootverbose) printf("\n"); } } /* * Look for an ACPI System Locality Distance Information Table ("SLIT") */ static int parse_slit(void) { if (resource_disabled("slit", 0)) { return (-1); } slit_physaddr = acpi_find_table(ACPI_SIG_SLIT); if (slit_physaddr == 0) { return (-1); } /* * Make a pass over the table to populate the cpus[] and * mem_info[] tables. */ slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT); slit_parse_table(slit); acpi_unmap_table(slit); slit = NULL; +#ifdef VM_NUMA_ALLOC /* Tell the VM about it! */ mem_locality = vm_locality_table; +#endif return (0); } /* * SRAT parsing. */ /* * Returns true if a memory range overlaps with at least one range in * phys_avail[]. */ static int overlaps_phys_avail(vm_paddr_t start, vm_paddr_t end) { int i; for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] < start) continue; if (phys_avail[i] < end) return (1); break; } return (0); } static void srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_SRAT_CPU_AFFINITY *cpu; ACPI_SRAT_X2APIC_CPU_AFFINITY *x2apic; ACPI_SRAT_MEM_AFFINITY *mem; int domain, i, slot; switch (entry->Type) { case ACPI_SRAT_TYPE_CPU_AFFINITY: cpu = (ACPI_SRAT_CPU_AFFINITY *)entry; domain = cpu->ProximityDomainLo | cpu->ProximityDomainHi[0] << 8 | cpu->ProximityDomainHi[1] << 16 | cpu->ProximityDomainHi[2] << 24; if (bootverbose) printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", cpu->ApicId, domain, (cpu->Flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled"); if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) break; KASSERT(!cpus[cpu->ApicId].enabled, ("Duplicate local APIC ID %u", cpu->ApicId)); cpus[cpu->ApicId].domain = domain; cpus[cpu->ApicId].enabled = 1; break; case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: x2apic = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)entry; if (bootverbose) printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", x2apic->ApicId, x2apic->ProximityDomain, (x2apic->Flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled"); if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED)) break; KASSERT(!cpus[x2apic->ApicId].enabled, ("Duplicate local APIC ID %u", x2apic->ApicId)); cpus[x2apic->ApicId].domain = x2apic->ProximityDomain; cpus[x2apic->ApicId].enabled = 1; break; case ACPI_SRAT_TYPE_MEMORY_AFFINITY: mem = (ACPI_SRAT_MEM_AFFINITY *)entry; if (bootverbose) printf( "SRAT: Found memory domain %d addr %jx len %jx: %s\n", mem->ProximityDomain, (uintmax_t)mem->BaseAddress, (uintmax_t)mem->Length, (mem->Flags & ACPI_SRAT_MEM_ENABLED) ? "enabled" : "disabled"); if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) break; if (!overlaps_phys_avail(mem->BaseAddress, mem->BaseAddress + mem->Length)) { printf("SRAT: Ignoring memory at addr %jx\n", (uintmax_t)mem->BaseAddress); break; } if (num_mem == VM_PHYSSEG_MAX) { printf("SRAT: Too many memory regions\n"); *(int *)arg = ENXIO; break; } slot = num_mem; for (i = 0; i < num_mem; i++) { if (mem_info[i].end <= mem->BaseAddress) continue; if (mem_info[i].start < (mem->BaseAddress + mem->Length)) { printf("SRAT: Overlapping memory entries\n"); *(int *)arg = ENXIO; return; } slot = i; } for (i = num_mem; i > slot; i--) mem_info[i] = mem_info[i - 1]; mem_info[slot].start = mem->BaseAddress; mem_info[slot].end = mem->BaseAddress + mem->Length; mem_info[slot].domain = mem->ProximityDomain; num_mem++; break; } } /* * Ensure each memory domain has at least one CPU and that each CPU * has at least one memory domain. */ static int check_domains(void) { int found, i, j; for (i = 0; i < num_mem; i++) { found = 0; for (j = 0; j <= MAX_APIC_ID; j++) if (cpus[j].enabled && cpus[j].domain == mem_info[i].domain) { cpus[j].has_memory = 1; found++; } if (!found) { printf("SRAT: No CPU found for memory domain %d\n", mem_info[i].domain); return (ENXIO); } } for (i = 0; i <= MAX_APIC_ID; i++) if (cpus[i].enabled && !cpus[i].has_memory) { printf("SRAT: No memory found for CPU %d\n", i); return (ENXIO); } return (0); } /* * Check that the SRAT memory regions cover all of the regions in * phys_avail[]. */ static int check_phys_avail(void) { vm_paddr_t address; int i, j; /* j is the current offset into phys_avail[]. */ address = phys_avail[0]; j = 0; for (i = 0; i < num_mem; i++) { /* * Consume as many phys_avail[] entries as fit in this * region. */ while (address >= mem_info[i].start && address <= mem_info[i].end) { /* * If we cover the rest of this phys_avail[] entry, * advance to the next entry. */ if (phys_avail[j + 1] <= mem_info[i].end) { j += 2; if (phys_avail[j] == 0 && phys_avail[j + 1] == 0) { return (0); } address = phys_avail[j]; } else address = mem_info[i].end + 1; } } printf("SRAT: No memory region found for %jx - %jx\n", (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]); return (ENXIO); } /* * Renumber the memory domains to be compact and zero-based if not * already. Returns an error if there are too many domains. */ static int renumber_domains(void) { int i, j, slot; /* Enumerate all the domains. */ - vm_ndomains = 0; + ndomain = 0; for (i = 0; i < num_mem; i++) { /* See if this domain is already known. */ - for (j = 0; j < vm_ndomains; j++) { - if (vm_domains[j] >= mem_info[i].domain) + for (j = 0; j < ndomain; j++) { + if (domain_pxm[j] >= mem_info[i].domain) break; } - if (j < vm_ndomains && vm_domains[j] == mem_info[i].domain) + if (j < ndomain && domain_pxm[j] == mem_info[i].domain) continue; /* Insert the new domain at slot 'j'. */ slot = j; - for (j = vm_ndomains; j > slot; j--) - vm_domains[j] = vm_domains[j - 1]; - vm_domains[slot] = mem_info[i].domain; - vm_ndomains++; - if (vm_ndomains > MAXMEMDOM) { - vm_ndomains = 1; + for (j = ndomain; j > slot; j--) + domain_pxm[j] = domain_pxm[j - 1]; + domain_pxm[slot] = mem_info[i].domain; + ndomain++; + if (ndomain > MAXMEMDOM) { + ndomain = 1; printf("SRAT: Too many memory domains\n"); return (EFBIG); } } - /* Renumber each domain to its index in the sorted 'domains' list. */ - for (i = 0; i < vm_ndomains; i++) { + /* Renumber each domain to its index in the sorted 'domain_pxm' list. */ + for (i = 0; i < ndomain; i++) { /* * If the domain is already the right value, no need * to renumber. */ - if (vm_domains[i] == i) + if (domain_pxm[i] == i) continue; /* Walk the cpu[] and mem_info[] arrays to renumber. */ for (j = 0; j < num_mem; j++) - if (mem_info[j].domain == vm_domains[i]) + if (mem_info[j].domain == domain_pxm[i]) mem_info[j].domain = i; for (j = 0; j <= MAX_APIC_ID; j++) - if (cpus[j].enabled && cpus[j].domain == vm_domains[i]) + if (cpus[j].enabled && cpus[j].domain == domain_pxm[i]) cpus[j].domain = i; } - KASSERT(vm_ndomains > 0, - ("renumber_domains: invalid final vm_ndomains setup")); return (0); } /* * Look for an ACPI System Resource Affinity Table ("SRAT") */ static int parse_srat(void) { int error; if (resource_disabled("srat", 0)) return (-1); srat_physaddr = acpi_find_table(ACPI_SIG_SRAT); if (srat_physaddr == 0) return (-1); /* * Make a pass over the table to populate the cpus[] and * mem_info[] tables. */ srat = acpi_map_table(srat_physaddr, ACPI_SIG_SRAT); error = 0; srat_walk_table(srat_parse_entry, &error); acpi_unmap_table(srat); srat = NULL; if (error || check_domains() != 0 || check_phys_avail() != 0 || renumber_domains() != 0) { srat_physaddr = 0; return (-1); } +#ifdef VM_NUMA_ALLOC /* Point vm_phys at our memory affinity table. */ + vm_ndomains = ndomain; mem_affinity = mem_info; +#endif return (0); } static void init_mem_locality(void) { int i; /* * For now, assume -1 == "no locality information for * this pairing. */ for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++) vm_locality_table[i] = -1; } static void parse_acpi_tables(void *dummy) { if (parse_srat() < 0) return; init_mem_locality(); (void) parse_slit(); } SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables, NULL); static void srat_walk_table(acpi_subtable_handler *handler, void *arg) { acpi_walk_subtables(srat + 1, (char *)srat + srat->Header.Length, handler, arg); } /* * Setup per-CPU domain IDs. */ static void srat_set_cpus(void *dummy) { struct cpu_info *cpu; struct pcpu *pc; u_int i; if (srat_physaddr == 0) return; for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; pc = pcpu_find(i); KASSERT(pc != NULL, ("no pcpu data for CPU %u", i)); cpu = &cpus[pc->pc_apic_id]; if (!cpu->enabled) panic("SRAT: CPU with APIC ID %u is not known", pc->pc_apic_id); pc->pc_domain = cpu->domain; CPU_SET(i, &cpuset_domain[cpu->domain]); if (bootverbose) printf("SRAT: CPU %u has memory domain %d\n", i, cpu->domain); } } SYSINIT(srat_set_cpus, SI_SUB_CPU, SI_ORDER_ANY, srat_set_cpus, NULL); /* * Map a _PXM value to a VM domain ID. * * Returns the domain ID, or -1 if no domain ID was found. */ int acpi_map_pxm_to_vm_domainid(int pxm) { int i; - for (i = 0; i < vm_ndomains; i++) { - if (vm_domains[i] == pxm) + for (i = 0; i < ndomain; i++) { + if (domain_pxm[i] == pxm) return (i); } + + return (-1); +} + +#else /* MAXMEMDOM == 1 */ + +int +acpi_map_pxm_to_vm_domainid(int pxm) +{ return (-1); } #endif /* MAXMEMDOM > 1 */