Index: stable/10/sys/amd64/conf/GENERIC =================================================================== --- stable/10/sys/amd64/conf/GENERIC (revision 284664) +++ stable/10/sys/amd64/conf/GENERIC (revision 284665) @@ -1,361 +1,364 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/amd64 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu HAMMER ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options TCP_OFFLOAD # TCP offload options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # New Network Filesystem Client options NFSD # New Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_RAID # Soft RAID functionality. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 options COMPAT_FREEBSD7 # Compatible with FreeBSD7 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. 
options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options PROCDESC # Support for process descriptors options MAC # TrustedBSD MAC Framework options KDTRACE_FRAME # Ensure frames are compiled in options KDTRACE_HOOKS # Kernel DTrace hooks options DDB_CTF # Kernel ELF linker loads CTF data options INCLUDE_CONFIG_FILE # Include this file in kernel +options RACCT # Resource accounting framework +options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default +options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel # CPU frequency control device cpufreq # Bus support. device acpi options ACPI_DMAR device pci # Floppy drives device fdc # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers options ATA_STATIC_ID # Static device numbering device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices options AHC_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~128k to driver. device ahd # AHA39320/29320 and onboard AIC79xx devices options AHD_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~215k to driver. device esp # AMD Am53C974 (Tekram DC-390(T)) device hptiop # Highpoint RocketRaid 3xxx series device isp # Qlogic family #device ispfw # Firmware for QLogic HBAs- normally a module device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device mpr # LSI-Logic MPT-Fusion 3 #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device trm # Tekram DC395U/UW/F DC315U adapters device adv # Advansys SCSI adapters device adw # Advansys wide SCSI adapters device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. 
device bt # Buslogic/Mylex MultiMaster SCSI adapters device isci # Intel C600 SAS controller # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Services (SES and SAF-TE) #device ctl # CAM Target Layer # RAID controllers interfaced to the SCSI subsystem device amr # AMI MegaRAID device arcmsr # Areca SATA II RAID #XXX it is not 64-bit clean, -scottl #device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID device ciss # Compaq Smart RAID 5* device dpt # DPT Smartcache III, IV - See NOTES for options device hptmv # Highpoint RocketRAID 182x device hptnr # Highpoint DC7280, R750 device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx device hpt27xx # Highpoint RocketRAID 27xx device iir # Intel Integrated RAID device ips # IBM (Adaptec) ServeRAID device mly # Mylex AcceleRAID/eXtremeRAID device twa # 3ware 9000 series PATA/SATA RAID device tws # LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller # RAID controllers device aac # Adaptec FSA RAID device aacp # SCSI passthrough for aac (requires CAM) device aacraid # Adaptec by PMC RAID device ida # Compaq Smart RAID device mfi # LSI MegaRAID SAS device mlx # Mylex DAC960 family device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s #XXX pointer/int warnings #device pst # Promise Supertrak SX6000 device twe # 3ware ATA RAID # NVM Express (NVMe) support device nvme # base NVMe driver device nvd # expose NVMe namespaces as disks, depends on nvme # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc # AT keyboard controller device atkbd # AT keyboard device psm # PS/2 mouse device kbdmux # keyboard multiplexer device vga # VGA video card driver options VESA # Add support for VESA BIOS Extensions (VBE) device splash # Splash screen and screen saver support # syscons is the default console driver, resembling an SCO console device sc options SC_PIXEL_MODE # add support for the raster text mode # vt is the new video console driver device vt device vt_vga device vt_efifb device agp # support several AGP chipsets # PCCARD (PCMCIA) support # PCMCIA and cardbus bridge support device cbb # cardbus (yenta) bridge device pccard # PC Card (16-bit) bus device cardbus # CardBus (32-bit) bus # Serial (COM) ports device uart # Generic UART driver # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device ppi # Parallel port interface device #device vpo # Requires scbus and da device puc # Multi I/O cards and multi-channel UARTs # PCI Ethernet NICs. device bxe # Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE device de # DEC/Intel DC21x4x (``Tulip'') device em # Intel PRO/1000 Gigabit Ethernet Family device igb # Intel PRO/1000 PCIE Server Gigabit Family device ix # Intel PRO/10GbE PCIE PF Ethernet device ixv # Intel PRO/10GbE PCIE VF Ethernet device ixl # Intel XL710 40Gbe PCIE Ethernet device ixlv # Intel XL710 40Gbe VF PCIE Ethernet device le # AMD Am7900 LANCE and Am79C9xx PCnet device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
device miibus # MII bus support device ae # Attansic/Atheros L2 FastEthernet device age # Attansic/Atheros L1 Gigabit Ethernet device alc # Atheros AR8131/AR8132 Ethernet device ale # Atheros AR8121/AR8113/AR8114 Ethernet device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device dc # DEC/Intel 21143 and various workalikes device et # Agere ET1310 10/100/Gigabit Ethernet device fxp # Intel EtherExpress PRO/100B (82557, 82558) device gem # Sun GEM/Sun ERI/Apple GMAC device hme # Sun HME (Happy Meal Ethernet) device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet device lge # Level 1 LXT1001 gigabit Ethernet device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device nfe # nVidia nForce MCP on-board Ethernet device nge # NatSemi DP83820 gigabit Ethernet #device nve # nVidia nForce MCP on-board Ethernet Networking device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vge # VIA VT612x gigabit Ethernet device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # ISA Ethernet NICs. pccard NICs included. device cs # Crystal Semiconductor CS89x0 NIC # 'device ed' requires 'device miibus' device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards device ex # Intel EtherExpress Pro/10 and Pro/10+ device ep # Etherlink III based cards device fe # Fujitsu MB8696x based cards device sn # SMC's 9000 series of Ethernet chips device xe # Xircom pccard Ethernet # Wireless NIC cards device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH # enable 802.11s draft support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support device wlan_amrr # AMRR transmit rate control algorithm device an # Aironet 4500/4800 802.11 wireless NICs. device ath # Atheros NICs device ath_pci # Atheros pci/cardbus glue device ath_hal # pci/cardbus chip support options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors options AH_AR5416_INTERRUPT_MITIGATION # AR5416 interrupt mitigation options ATH_ENABLE_11N # Enable 802.11n support for AR5416 and later device ath_rate_sample # SampleRate tx rate control for ath #device bwi # Broadcom BCM430x/BCM431x wireless NICs. #device bwn # Broadcom BCM43xx wireless NICs. device ipw # Intel 2100 wireless NICs. device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. device iwn # Intel 4965/1000/5000/6000 wireless NICs. device malo # Marvell Libertas wireless NICs. device mwl # Marvell 88W8363 802.11n wireless NICs. device ral # Ralink Technology RT2500 wireless NICs. device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. device wpi # Intel 3945ABG wireless NICs. # Pseudo devices. 
device loop # Network loopback device random # Entropy device device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith # IPv6-to-IPv4 relaying (translation) device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) device xhci # XHCI PCI->USB interface (USB 3.0) device usb # USB Bus (required) device ukbd # Keyboard device umass # Disks/Mass storage - Requires scbus and da # Sound support device sound # Generic sound driver (required) device snd_cmi # CMedia CMI8338/CMI8738 device snd_csa # Crystal Semiconductor CS461x/428x device snd_emu10kx # Creative SoundBlaster Live! and Audigy device snd_es137x # Ensoniq AudioPCI ES137x device snd_hda # Intel High Definition Audio device snd_ich # Intel, NVidia and other ICH AC'97 Audio device snd_via8233 # VIA VT8233x Audio # MMC/SD device mmc # MMC/SD bus device mmcsd # MMC/SD memory card device sdhci # Generic PCI SD Host Controller # VirtIO support device virtio # Generic VirtIO bus (required) device virtio_pci # VirtIO PCI device device vtnet # VirtIO Ethernet device device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device # HyperV drivers and enhancement support # NOTE: HYPERV depends on hyperv. They must be added or removed together. options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations # NOTE: XENHVM depends on xenpci. They must be added or removed together. options XENHVM # Xen HVM kernel infrastructure device xenpci # Xen HVM Hypervisor services driver # VMware support device vmx # VMware VMXNET3 Ethernet Index: stable/10/sys/conf/options =================================================================== --- stable/10/sys/conf/options (revision 284664) +++ stable/10/sys/conf/options (revision 284665) @@ -1,939 +1,940 @@ # $FreeBSD$ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switches code on, or only switches # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.<machine>". Options that affect a single source-file # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_<junta>.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. 
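For illustration of the opt_*.h convention described above: an entry such as "COMPRESS_USER_CORES opt_core.h" later in this file tells config(8) to emit a small generated header containing "#define COMPRESS_USER_CORES 1" whenever the kernel configuration enables that option, and any source file interested in the option includes that header and tests the macro (imgact_elf.c, further down in this diff, includes "opt_core.h" and checks COMPRESS_USER_CORES in exactly this way). The sketch below only illustrates the pattern and is not part of this commit; the helper function name is hypothetical.

/* opt_core.h is generated by config(8); it is empty when the option is off. */
#include "opt_core.h"

/* Hypothetical helper showing how kernel code reacts to a build-time option. */
static int
coredump_compression_compiled_in(void)
{
#ifdef COMPRESS_USER_CORES
	return (1);	/* "options COMPRESS_USER_CORES" was in the kernel config */
#else
	return (0);	/* option not configured: the macro is undefined */
#endif
}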
# # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h ADW_ALLOW_MEMIO opt_adw.h TWA_DEBUG opt_twa.h TWA_FLASH_FIRMWARE opt_twa.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h SYSCTL_DEBUG opt_sysctl.h EARLY_PRINTF opt_global.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h # Miscellaneous options. ADAPTIVE_LOCKMGRS ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h COMPAT_43TTY opt_compat.h COMPAT_FREEBSD4 opt_compat.h COMPAT_FREEBSD5 opt_compat.h COMPAT_FREEBSD6 opt_compat.h COMPAT_FREEBSD7 opt_compat.h COMPILING_LINT opt_global.h COMPRESS_USER_CORES opt_core.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h IPI_PREEMPTION opt_sched.h GEOM_AES opt_geom.h GEOM_BDE opt_geom.h GEOM_BSD opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_FOX opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MBR opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_EBR_COMPAT opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_PART_PC98 opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_PC98 opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_SUNLABEL opt_geom.h GEOM_UNCOMPRESS opt_geom.h GEOM_UZIP opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_VOL opt_geom.h GEOM_ZERO opt_geom.h KDTRACE_HOOKS opt_kdtrace.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_IFOFF opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_SIZE opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_EVENTTIMERS opt_timer.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_swap.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h PROCDESC opt_procdesc.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h SPX_HACK STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h 
MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TURNSTILE_PROFILING UMTX_PROFILING VFS_AIO VERBOSE_SYSINIT opt_global.h WLCACHE opt_wavelan.h WLDEBUG opt_wavelan.h # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSE opt_dontuse.h MSDOSFS opt_dontuse.h NANDFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h REISERFS opt_dontuse.h SMBFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. NFSCLIENT opt_nfs.h NFSSERVER opt_nfs.h # Use these options to compile the experimental nfs client and/or # server that supports NFSv4 into a kernel. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of things # and make the system safer from crashes at the same time. # Otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # We plan to remove the static dependences above, with a # <filesystem>_ROOT option to control if it is usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Options used only in subr_param.c. 
HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ADA_TEST_FAILURE opt_ada.h ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). SYM_SETUP_LP_PROBE_MAP opt_sym.h #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in pci/ncr.c SCSI_NCR_DEBUG opt_ncr.h SCSI_NCR_MAX_SYNC opt_ncr.h SCSI_NCR_MAX_WIDE opt_ncr.h SCSI_NCR_MYADDR opt_ncr.h # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. 
ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DEV_ENC opt_enc.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_VLAN opt_vlan.h DUMMYNET opt_ipdn.h ETHER_8022 opt_ef.h ETHER_8023 opt_ef.h ETHER_II opt_ef.h ETHER_SNAP opt_ef.h INET opt_inet.h INET6 opt_inet6.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_FILTERTUNNEL opt_ipsec.h IPSEC_NAT_T opt_ipsec.h IPSTEALTH IPX KRPC LIBALIAS LIBMBPOOL LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NETATALK opt_atalk.h NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG SIFTR TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h XBONEHACK FLOWTABLE opt_route.h FLOWTABLE_HASH_ALL opt_route.h # # SCTP # SCTP opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_WITH_NO_CSUM opt_sctp.h # Use this at your peril SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). 
NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_ATMLLC opt_netgraph.h NETGRAPH_ATM_ATMPIF opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h # MPPC compression requires proprietary files (not included) NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h NGATM_ATMBASE opt_netgraph.h NGATM_SSCOP opt_netgraph.h NGATM_SSCFU opt_netgraph.h NGATM_UNI opt_netgraph.h NGATM_CCATM opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # XXX Conflict: # of devices vs network protocol (Native ATM). # This makes "atm.h" unusable. NATM # DPT driver debug flags DPT_MEASURE_PERFORMANCE opt_dpt.h DPT_RESET_HBA opt_dpt.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. 
CLUSTERDEBUG opt_debug_cluster.h DEBUG_1284 opt_ppb_1284.h VP0_DEBUG opt_vpo.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h NETATALKDEBUG opt_atalk.h SI_DEBUG opt_debug_si.h # Fb options FB_DEBUG opt_fb.h FB_INSTALL_CDEV opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_global.h MCLSHIFT opt_global.h MUTEX_DEBUG opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h LOCK_PROFILING_FAST opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options SMP opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_GATHERDELAY opt_nfs.h NFS_WDELAYHASHSIZ opt_nfs.h NFS_DEBUG opt_nfs.h # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver OVERRIDE_CARD opt_bktr.h OVERRIDE_TUNER opt_bktr.h OVERRIDE_DBX opt_bktr.h OVERRIDE_MSP opt_bktr.h BROOKTREE_SYSTEM_DEFAULT opt_bktr.h BROOKTREE_ALLOC_PAGES opt_bktr.h BKTR_OVERRIDE_CARD opt_bktr.h BKTR_OVERRIDE_TUNER opt_bktr.h BKTR_OVERRIDE_DBX opt_bktr.h BKTR_OVERRIDE_MSP opt_bktr.h BKTR_SYSTEM_DEFAULT opt_bktr.h BKTR_ALLOC_PAGES opt_bktr.h BKTR_USE_PLL opt_bktr.h BKTR_GPIO_ACCESS opt_bktr.h BKTR_NO_MSP_RESET opt_bktr.h BKTR_430_FX_MODE opt_bktr.h BKTR_SIS_VIA_MODE opt_bktr.h BKTR_USE_FREEBSD_SMBUS opt_bktr.h BKTR_NEW_MSP34XX_DRIVER opt_bktr.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h OPENSOLARIS_WITNESS opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h ACPI_DMAR opt_acpi.h # ISA support DEV_ISA opt_isa.h ISAPNP opt_isa.h # various 'device presence' options. 
DEV_BPF opt_bpf.h DEV_NETMAP opt_global.h DEV_MCA opt_mca.h DEV_CARP opt_carp.h DEV_SPLASH opt_splash.h # EISA support DEV_EISA opt_eisa.h EISA_SLOTS opt_eisa.h # ed driver ED_HPP opt_ed.h ED_3C503 opt_ed.h ED_SIC opt_ed.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for ubsec driver UBSEC_DEBUG opt_ubsec.h UBSEC_RNDTEST opt_ubsec.h UBSEC_NO_RNG opt_ubsec.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_DEFAULT_WIDTH opt_syscons.h VT_FB_DEFAULT_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_11N opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h # options for the Atheros hal AH_SUPPORT_AR5416 opt_ah.h # XXX For now, this breaks non-AR9130 chipsets, so only use it # XXX when actually targetting AR9130. 
AH_SUPPORT_AR9130 opt_ah.h # This is required for AR933x SoC support AH_SUPPORT_AR9330 opt_ah.h AH_SUPPORT_AR9340 opt_ah.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS HWPMC_MIPS_BACKTRACE opt_hwpmc_hooks.h # XBOX options for FreeBSD/i386, but some files are MI XBOX opt_xbox.h # Interrupt filtering INTR_FILTER # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_AMPDU_AGE opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h +RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) RANDOM_YARROW opt_random.h RANDOM_FORTUNA opt_random.h RANDOM_DEBUG opt_random.h RANDOM_RWFILE opt_random.h # Intel em(4) driver EM_MULTIQUEUE opt_em.h Index: stable/10/sys/i386/conf/GENERIC =================================================================== --- stable/10/sys/i386/conf/GENERIC (revision 284664) +++ stable/10/sys/i386/conf/GENERIC (revision 284665) @@ -1,370 +1,373 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/i386 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in 
/usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu I486_CPU cpu I586_CPU cpu I686_CPU ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options TCP_OFFLOAD # TCP offload options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # New Network Filesystem Client options NFSD # New Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_RAID # Soft RAID functionality. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 options COMPAT_FREEBSD7 # Compatible with FreeBSD7 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options PROCDESC # Support for process descriptors options MAC # TrustedBSD MAC Framework options KDTRACE_HOOKS # Kernel DTrace hooks options DDB_CTF # Kernel ELF linker loads CTF data options INCLUDE_CONFIG_FILE # Include this file in kernel +options RACCT # Resource accounting framework +options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default +options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # To make an SMP kernel, the next two lines are needed options SMP # Symmetric MultiProcessor Kernel device apic # I/O APIC # CPU frequency control device cpufreq # Bus support. 
device acpi device eisa device pci # Floppy drives device fdc # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers options ATA_STATIC_ID # Static device numbering device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahb # EISA AHA1742 family device ahc # AHA2940 and onboard AIC7xxx devices options AHC_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~128k to driver. device ahd # AHA39320/29320 and onboard AIC79xx devices options AHD_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~215k to driver. device esp # AMD Am53C974 (Tekram DC-390(T)) device hptiop # Highpoint RocketRaid 3xxx series device isp # Qlogic family #device ispfw # Firmware for QLogic HBAs- normally a module device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device mpr # LSI-Logic MPT-Fusion 3 #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device trm # Tekram DC395U/UW/F DC315U adapters device adv # Advansys SCSI adapters device adw # Advansys wide SCSI adapters device aha # Adaptec 154x SCSI adapters device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. device bt # Buslogic/Mylex MultiMaster SCSI adapters device ncv # NCR 53C500 device nsp # Workbit Ninja SCSI-3 device stg # TMC 18C30/18C50 device isci # Intel C600 SAS controller # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Services (SES and SAF-TE) #device ctl # CAM Target Layer # RAID controllers interfaced to the SCSI subsystem device amr # AMI MegaRAID device arcmsr # Areca SATA II RAID device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID device ciss # Compaq Smart RAID 5* device dpt # DPT Smartcache III, IV - See NOTES for options device hptmv # Highpoint RocketRAID 182x device hptnr # Highpoint DC7280, R750 device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx device hpt27xx # Highpoint RocketRAID 27xx device iir # Intel Integrated RAID device ips # IBM (Adaptec) ServeRAID device mly # Mylex AcceleRAID/eXtremeRAID device twa # 3ware 9000 series PATA/SATA RAID device tws # LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller # RAID controllers device aac # Adaptec FSA RAID device aacp # SCSI passthrough for aac (requires CAM) device aacraid # Adaptec by PMC RAID device ida # Compaq Smart RAID device mfi # LSI MegaRAID SAS device mlx # Mylex DAC960 family device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s device pst # Promise Supertrak SX6000 device twe # 3ware ATA RAID # NVM Express (NVMe) support device nvme # base NVMe driver device nvd # expose NVMe namespace as disks, depends on nvme # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc # AT keyboard controller device atkbd # AT keyboard device psm # PS/2 mouse device kbdmux # keyboard multiplexer device vga # VGA video card driver options VESA # Add support for VESA BIOS Extensions (VBE) device splash # Splash screen and screen saver support # syscons is the default console driver, resembling an SCO console device sc options SC_PIXEL_MODE # add support for the raster text mode # vt is the new video console driver device vt device vt_vga device agp # support several AGP chipsets # Power management support (see NOTES 
for more options) #device apm # Add suspend/resume support for the i8254. device pmtimer # PCCARD (PCMCIA) support # PCMCIA and cardbus bridge support device cbb # cardbus (yenta) bridge device pccard # PC Card (16-bit) bus device cardbus # CardBus (32-bit) bus # Serial (COM) ports device uart # Generic UART driver # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device ppi # Parallel port interface device #device vpo # Requires scbus and da device puc # Multi I/O cards and multi-channel UARTs # PCI Ethernet NICs. device bxe # Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE device de # DEC/Intel DC21x4x (``Tulip'') device em # Intel PRO/1000 Gigabit Ethernet Family device igb # Intel PRO/1000 PCIE Server Gigabit Family device ixgb # Intel PRO/10GbE Ethernet Card device le # AMD Am7900 LANCE and Am79C9xx PCnet device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! device miibus # MII bus support device ae # Attansic/Atheros L2 FastEthernet device age # Attansic/Atheros L1 Gigabit Ethernet device alc # Atheros AR8131/AR8132 Ethernet device ale # Atheros AR8121/AR8113/AR8114 Ethernet device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device dc # DEC/Intel 21143 and various workalikes device et # Agere ET1310 10/100/Gigabit Ethernet device fxp # Intel EtherExpress PRO/100B (82557, 82558) device gem # Sun GEM/Sun ERI/Apple GMAC device hme # Sun HME (Happy Meal Ethernet) device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet device lge # Level 1 LXT1001 gigabit Ethernet device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device nfe # nVidia nForce MCP on-board Ethernet device nge # NatSemi DP83820 gigabit Ethernet #device nve # nVidia nForce MCP on-board Ethernet Networking device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vge # VIA VT612x gigabit Ethernet device vr # VIA Rhine, Rhine II device vte # DM&P Vortex86 RDC R6040 Fast Ethernet device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # ISA Ethernet NICs. pccard NICs included. device cs # Crystal Semiconductor CS89x0 NIC # 'device ed' requires 'device miibus' device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards device ex # Intel EtherExpress Pro/10 and Pro/10+ device ep # Etherlink III based cards device fe # Fujitsu MB8696x based cards device ie # EtherExpress 8/16, 3C507, StarLAN 10 etc. 
device sn # SMC's 9000 series of Ethernet chips device xe # Xircom pccard Ethernet # Wireless NIC cards device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH # enable 802.11s draft support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support device wlan_amrr # AMRR transmit rate control algorithm device an # Aironet 4500/4800 802.11 wireless NICs. device ath # Atheros NICs device ath_pci # Atheros pci/cardbus glue device ath_hal # pci/cardbus chip support options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors options AH_AR5416_INTERRUPT_MITIGATION # AR5416 interrupt mitigation options ATH_ENABLE_11N # Enable 802.11n support for AR5416 and later device ath_rate_sample # SampleRate tx rate control for ath #device bwi # Broadcom BCM430x/BCM431x wireless NICs. #device bwn # Broadcom BCM43xx wireless NICs. device ipw # Intel 2100 wireless NICs. device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. device iwn # Intel 4965/1000/5000/6000 wireless NICs. device malo # Marvell Libertas wireless NICs. device mwl # Marvell 88W8363 802.11n wireless NICs. device ral # Ralink Technology RT2500 wireless NICs. device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. #device wl # Older non 802.11 Wavelan wireless NIC. device wpi # Intel 3945ABG wireless NICs. # Pseudo devices. device loop # Network loopback device random # Entropy device device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith # IPv6-to-IPv4 relaying (translation) device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) device xhci # XHCI PCI->USB interface (USB 3.0) device usb # USB Bus (required) device ukbd # Keyboard device umass # Disks/Mass storage - Requires scbus and da # Sound support device sound # Generic sound driver (required) device snd_cmi # CMedia CMI8338/CMI8738 device snd_csa # Crystal Semiconductor CS461x/428x device snd_emu10kx # Creative SoundBlaster Live! and Audigy device snd_es137x # Ensoniq AudioPCI ES137x device snd_hda # Intel High Definition Audio device snd_ich # Intel, NVidia and other ICH AC'97 Audio device snd_via8233 # VIA VT8233x Audio # MMC/SD device mmc # MMC/SD bus device mmcsd # MMC/SD memory card device sdhci # Generic PCI SD Host Controller # VirtIO support device virtio # Generic VirtIO bus (required) device virtio_pci # VirtIO PCI device device vtnet # VirtIO Ethernet device device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device # HyperV drivers and enchancement support # NOTE: HYPERV depends on hyperv. They must be added or removed together. options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations # NOTE: XENHVM depends on xenpci. They must be added or removed together. 
options XENHVM # Xen HVM kernel infrastructure device xenpci # Xen HVM Hypervisor services driver # VMware support device vmx # VMware VMXNET3 Ethernet Index: stable/10/sys/kern/imgact_elf.c =================================================================== --- stable/10/sys/kern/imgact_elf.c (revision 284664) +++ stable/10/sys/kern/imgact_elf.c (revision 284665) @@ -1,2185 +1,2187 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_core.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int interp_name_len, int32_t *osrel); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); #ifdef COMPRESS_USER_CORES static int compress_core(gzFile, char *, char *, unsigned int, struct thread * td); #endif #define CORE_BUF_SIZE (16 * 1024) int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand", &__elfN(fallback_brand)); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); int __elfN(nxstack) = #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ 1; #else 0; #endif SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack"); #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) || defined(__ia64__) int i386_read_exec = 0; SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif #endif static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) #define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a)) static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = 1, .vendor = FREEBSD_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = __elfN(freebsd_trans_osrel) }; static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) { uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); *osrel = *(const int32_t *)(p); return (TRUE); } static const char GNU_ABI_VENDOR[] = "GNU"; static int GNU_KFREEBSD_ABI_DESC = 3; Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), 
.hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = kfreebsd_trans_osrel }; static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; if (desc[0] != GNU_KFREEBSD_ABI_DESC) return (FALSE); /* * Debian GNU/kFreeBSD embed the earliest compatible kernel version * (__FreeBSD_version: Rxx) in the LSB way. */ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3]; return (TRUE); } int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) { printf("WARNING: %s: could not insert brandinfo entry: %p\n", __func__, entry); return (-1); } return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int interp_name_len, int32_t *osrel) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi; boolean_t ret; int i; /* * We support four types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, (3) path of the `interp_path' * field, and (4) the ".note.ABI-tag" ELF section. */ /* Look for an ".note.ABI-tag" ELF section */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL) continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { ret = __elfN(check_note)(imgp, bi->brand_note, osrel); if (ret) return (bi); } } /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) return (bi); } /* Lacking a known brand, search for a recognized interpreter. 
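 *
 * (Illustrative aside, not part of the original file: kfreebsd_trans_osrel()
 * above packs the three descriptor words that follow the GNU vendor tag into
 * a single __FreeBSD_version-style integer.  A minimal userland sketch of
 * that arithmetic follows; the descriptor values are hypothetical.)
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Hypothetical note descriptor: ABI tag 3, earliest kernel 9.1.0. */
	const uint32_t desc[4] = { 3, 9, 1, 0 };
	int32_t osrel;

	osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
	printf("osrel = %d\n", osrel);		/* prints 901000 */
	return (0);
}
/*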
*/ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && /* ELF image p_filesz includes terminating zero */ strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. */ if (object) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. 
*/ rv = vm_map_find(map, NULL, 0, &start, end - start, 0, VMFS_NO_SPACE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } offset += sz; } rv = KERN_SUCCESS; } else { vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, prot, VM_PROT_ALL, cow); vm_map_unlock(map); if (rv != KERN_SUCCESS) vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { struct sf_buf *sf; size_t map_len; vm_map_t map; vm_object_t object; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } object = imgp->object; map = &imgp->proc->p_vmspace->vm_map; map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! 
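 *
 * (Illustrative aside, not part of the original file: the copy_len/map_addr/
 * map_len arithmetic computed just above can be tried out in a tiny userland
 * program.  The p_offset/p_vaddr/p_filesz/p_memsz values below are
 * hypothetical, chosen so that offset and vmaddr are congruent modulo the
 * page size, as the ELF format requires.)
 */
#include <stdio.h>
#include <stddef.h>

#define trunc_page_ps(va, ps)	((va) & ~((ps) - 1))
#define round_page_ps(va, ps)	(((va) + ((ps) - 1)) & ~((ps) - 1))

int
main(void)
{
	size_t pagesize = 4096;
	unsigned long offset = 0xd40, vmaddr = 0x600d40;
	unsigned long filsz = 0x14c0, memsz = 0x1cc8;
	unsigned long copy_len, map_addr, map_len;

	/* File bytes that spill past the last fully file-backed page. */
	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
	/* Anonymous region that still has to be mapped for bss. */
	map_addr = trunc_page_ps(vmaddr + filsz, pagesize);
	map_len = round_page_ps(vmaddr + memsz, pagesize) - map_addr;

	printf("copy_len=%#lx map_addr=%#lx map_len=%#lx\n",
	    copy_len, map_addr, map_len);	/* 0x200, 0x602000, 0x1000 */
	return (0);
}
/*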
*/ if (map_len != 0) { rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int error, i, numsegs; #ifdef CAPABILITY_MODE /* * XXXJA: This check can go away once we are sufficiently confident * that the checks in namei() are correct. */ if (IN_CAPABILITY_MODE(curthread)) return (ECAPMODE); #endif tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread); if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. 
*/ VOP_SET_TEXT(nd->ni_vp); imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) { /* Loadable segment */ prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize); if (error != 0) goto fail; /* * Establish the base address if this is the * first segment. */ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) vput(nd->ni_vp); free(tempdata, M_TEMP); return (error); } static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_prot_t prot; u_long text_size = 0, data_size = 0, total_size = 0; u_long text_addr = 0, data_addr = 0; u_long seg_size, seg_addr; u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0; int32_t osrel = 0; int error = 0, i, n, interp_name_len = 0; const char *interp = NULL, *newinterp = NULL; Elf_Brandinfo *brand_info; char *path; struct sysentvec *sv; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { /* Only support headers in first page for now */ return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) return (ENOEXEC); n = 0; baddr = 0; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: if (n == 0) baddr = phdr[i].p_vaddr; n++; break; case PT_INTERP: /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN || phdr[i].p_offset > PAGE_SIZE || phdr[i].p_filesz > PAGE_SIZE - phdr[i].p_offset) return (ENOEXEC); interp = imgp->image_header + phdr[i].p_offset; interp_name_len = phdr[i].p_filesz; break; case PT_GNU_STACK: if (__elfN(nxstack)) imgp->stack_prot = __elfN(trans_prot)(phdr[i].p_flags); imgp->stack_sz = phdr[i].p_memsz; break; } } brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len, &osrel); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); return (ENOEXEC); } if (hdr->e_type == ET_DYN) { if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) return (ENOEXEC); /* * Honour the base load address from the dso if it is * non-zero for some reason. 
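 *
 * (Illustrative aside, not part of the original file: both load_file() above
 * and the image activator insist that the whole program header table lives in
 * the first page of the file.  A standalone sketch of that check, using
 * hypothetical but typical amd64 header values, follows.)
 */
#include <stdio.h>
#include <elf.h>

int
main(void)
{
	unsigned long pagesz = 4096;
	Elf64_Ehdr hdr = { .e_phoff = 64, .e_phentsize = 56, .e_phnum = 9 };
	int ok;

	ok = !(hdr.e_phoff > pagesz ||
	    (unsigned int)hdr.e_phentsize * hdr.e_phnum >
	    pagesz - hdr.e_phoff);
	printf("program headers fit in the first page: %s\n",
	    ok ? "yes" : "no");		/* 64 + 56 * 9 = 568 < 4096 */
	return (0);
}
/*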
*/ if (baddr == 0) et_dyn_addr = ET_DYN_LOAD_ADDR; else et_dyn_addr = 0; } else et_dyn_addr = 0; sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) newinterp = brand_info->interp_newpath; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. * * The VV_TEXT flag prevents modifications to the executable while * the vnode is unlocked. */ VOP_UNLOCK(imgp->vp, 0); error = exec_new_vmspace(imgp, sv); imgp->proc->p_sysent = sv; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ if (phdr[i].p_memsz == 0) break; prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize); if (error != 0) return (error); /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff + et_dyn_addr; seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr + et_dyn_addr - seg_addr); /* * Make the largest executable segment the official * text segment and all others data. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (phdr[i].p_flags & PF_X && text_size < seg_size) { text_size = seg_size; text_addr = seg_addr; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr + et_dyn_addr; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } entry = (u_long)hdr->e_entry + et_dyn_addr; /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || total_size > lim_cur(imgp->proc, RLIMIT_VMEM) || racct_set(imgp->proc, RACCT_DATA, data_size) != 0 || racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } vmspace = imgp->proc->p_vmspace; vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. 
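 *
 * (Illustrative aside, not part of the original file: with hypothetical amd64
 * numbers -- a data segment starting at 0x600000 and a 32 GB hard RLIMIT_DATA
 * -- the computation just below places the run-time linker past the largest
 * possible heap.  A standalone sketch of the same arithmetic:)
 */
#include <stdio.h>

#define round_page(x)	(((x) + 4095UL) & ~4095UL)

int
main(void)
{
	unsigned long daddr = 0x600000UL;			/* vm_daddr */
	unsigned long data_max = 32UL * 1024 * 1024 * 1024;	/* lim_max(RLIMIT_DATA) */
	unsigned long addr = round_page(daddr + data_max);

	printf("interpreter base: %#lx\n", addr);	/* 0x800600000 */
	return (0);
}
/*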
*/ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(imgp->proc, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; if (interp != NULL) { int have_interp = FALSE; VOP_UNLOCK(imgp->vp, 0); if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, &addr, &imgp->entry_addr, sv->sv_pagesize); free(path, M_TEMP); if (error == 0) have_interp = TRUE; } if (!have_interp && newinterp != NULL) { error = __elfN(load_file)(imgp->proc, newinterp, &addr, &imgp->entry_addr, sv->sv_pagesize); if (error == 0) have_interp = TRUE; } if (!have_interp) { error = __elfN(load_file)(imgp->proc, interp, &addr, &imgp->entry_addr, sv->sv_pagesize); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) { uprintf("ELF interpreter %s not found\n", interp); return (error); } } else addr = et_dyn_addr; /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; imgp->reloc_base = addr; imgp->proc->p_osrel = osrel; return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp); AUXARGS_ENTRY(pos, AT_OSRELDATE, imgp->proc->p_ucred->cr_prison->pr_osreldate); if (imgp->canary != 0) { AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary); AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen); } AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus); if (imgp->pagesizes != 0) { AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes); AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen); } if (imgp->sysent->sv_timekeep_base != 0) { AUXARGS_ENTRY(pos, AT_TIMEKEEP, imgp->sysent->sv_timekeep_base); } AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : imgp->sysent->sv_stackprot); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. 
*/ }; typedef void (*outfunc_t)(void *, struct sbuf *, size_t *); struct note_info { int type; /* Note type. */ outfunc_t outfunc; /* Output function. */ void *outarg; /* Argument for the output function. */ size_t outsize; /* Output size. */ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */ }; TAILQ_HEAD(note_info_list, note_info); static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *, int, void *, size_t, struct note_info_list *, size_t, gzFile); static void __elfN(prepare_notes)(struct thread *, struct note_info_list *, size_t *); static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t); static void __elfN(putnote)(struct note_info *, struct sbuf *); static size_t register_note(struct note_info_list *, int, outfunc_t, void *); static int sbuf_drain_core_output(void *, const char *, int); static int sbuf_drain_count(void *arg, const char *data, int len); static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *); static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *); static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *); static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *); static void note_procstat_files(void *, struct sbuf *, size_t *); static void note_procstat_groups(void *, struct sbuf *, size_t *); static void note_procstat_osrel(void *, struct sbuf *, size_t *); static void note_procstat_rlimit(void *, struct sbuf *, size_t *); static void note_procstat_umask(void *, struct sbuf *, size_t *); static void note_procstat_vmmap(void *, struct sbuf *, size_t *); #ifdef COMPRESS_USER_CORES extern int compress_user_cores; extern int compress_user_cores_gzlevel; #endif static int core_output(struct vnode *vp, void *base, size_t len, off_t offset, struct ucred *active_cred, struct ucred *file_cred, struct thread *td, char *core_buf, gzFile gzfile) { int error; if (gzfile) { #ifdef COMPRESS_USER_CORES error = compress_core(gzfile, base, core_buf, len, td); #else panic("shouldn't be here"); #endif } else { error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset, UIO_USERSPACE, IO_UNIT | IO_DIRECT, active_cred, file_cred, NULL, td); } return (error); } /* Coredump output parameters for sbuf drain routine. */ struct sbuf_drain_core_params { off_t offset; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; struct vnode *vp; #ifdef COMPRESS_USER_CORES gzFile gzfile; #endif }; /* * Drain into a core file. */ static int sbuf_drain_core_output(void *arg, const char *data, int len) { struct sbuf_drain_core_params *p; int error, locked; p = (struct sbuf_drain_core_params *)arg; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. 
*/ locked = PROC_LOCKED(p->td->td_proc); if (locked) PROC_UNLOCK(p->td->td_proc); #ifdef COMPRESS_USER_CORES if (p->gzfile != Z_NULL) error = compress_core(p->gzfile, NULL, __DECONST(char *, data), len, p->td); else #endif error = vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, data), len, p->offset, UIO_SYSSPACE, IO_UNIT | IO_DIRECT, p->active_cred, p->file_cred, NULL, p->td); if (locked) PROC_LOCK(p->td->td_proc); if (error != 0) return (-error); p->offset += len; return (len); } /* * Drain into a counter. */ static int sbuf_drain_count(void *arg, const char *data __unused, int len) { size_t *sizep; sizep = (size_t *)arg; *sizep += len; return (len); } int __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; struct note_info_list notelst; struct note_info *ninfo; void *hdr; size_t hdrsize, notesz, coresize; gzFile gzfile = Z_NULL; char *core_buf = NULL; #ifdef COMPRESS_USER_CORES char gzopen_flags[8]; char *p; int doing_compress = flags & IMGACT_CORE_COMPRESS; #endif hdr = NULL; TAILQ_INIT(¬elst); #ifdef COMPRESS_USER_CORES if (doing_compress) { p = gzopen_flags; *p++ = 'w'; if (compress_user_cores_gzlevel >= 0 && compress_user_cores_gzlevel <= 9) *p++ = '0' + compress_user_cores_gzlevel; *p = 0; gzfile = gz_open("", gzopen_flags, vp); if (gzfile == Z_NULL) { error = EFAULT; goto done; } core_buf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); if (!core_buf) { error = ENOMEM; goto done; } } #endif /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Collect info about the core file header area. */ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count); __elfN(prepare_notes)(td, ¬elst, ¬esz); coresize = round_page(hdrsize + notesz) + seginfo.size; #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_CORE, coresize); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = EFAULT; - goto done; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_CORE, coresize); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = EFAULT; + goto done; + } } #endif if (coresize >= limit) { error = EFAULT; goto done; } /* * Allocate memory for building the header, fill it up, * and write it out following the notes. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); if (hdr == NULL) { error = EINVAL; goto done; } error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize, ¬elst, notesz, gzfile); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = round_page(hdrsize + notesz); for (i = 0; i < seginfo.count; i++) { error = core_output(vp, (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, cred, NOCRED, curthread, core_buf, gzfile); if (error != 0) break; offset += php->p_filesz; php++; } } if (error) { log(LOG_WARNING, "Failed to write core file for process %s (error %d)\n", curproc->p_comm, error); } done: #ifdef COMPRESS_USER_CORES if (core_buf) free(core_buf, M_TEMP); if (gzfile) gzclose(gzfile); #endif while ((ninfo = TAILQ_FIRST(¬elst)) != NULL) { TAILQ_REMOVE(¬elst, ninfo, link); free(ninfo, M_TEMP); } if (hdr != NULL) free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. 
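 *
 * (Illustrative aside, not part of the original file: the size charged to
 * RACCT_CORE above -- now only when resource accounting is actually enabled
 * -- and compared against the core dump limit is the header area rounded up
 * to a page plus all writable segments.  A standalone sketch of that
 * computation with hypothetical numbers:)
 */
#include <stdio.h>
#include <elf.h>

#define round_page(x)	(((x) + 4095UL) & ~4095UL)

int
main(void)
{
	int count = 25;				/* writable segments */
	size_t segsize = 3UL * 1024 * 1024;	/* their total size */
	size_t notesz = 2048;			/* all ELF notes */
	size_t hdrsize, coresize;

	hdrsize = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * (1 + count);
	coresize = round_page(hdrsize + notesz) + segsize;
	printf("hdrsize=%zu coresize=%zu\n", hdrsize, coresize);
	return (0);
}
/*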
*/ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = __elfN(untrans_prot)(entry->protection); phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_RLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE && object->type != OBJT_PHYS; VM_OBJECT_RUNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred, int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst, size_t notesz, gzFile gzfile) { struct sbuf_drain_core_params params; struct note_info *ninfo; struct sbuf *sb; int error; /* Fill in the header. */ bzero(hdr, hdrsize); __elfN(puthdr)(td, hdr, hdrsize, numsegs, notesz); params.offset = 0; params.active_cred = cred; params.file_cred = NOCRED; params.td = td; params.vp = vp; #ifdef COMPRESS_USER_CORES params.gzfile = gzfile; #endif sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_core_output, ¶ms); sbuf_start_section(sb, NULL); sbuf_bcat(sb, hdr, hdrsize); TAILQ_FOREACH(ninfo, notelst, link) __elfN(putnote)(ninfo, sb); /* Align up to a page boundary for the program segments. 
*/ sbuf_end_section(sb, -1, PAGE_SIZE, 0); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static void __elfN(prepare_notes)(struct thread *td, struct note_info_list *list, size_t *sizep) { struct proc *p; struct thread *thr; size_t size; p = td->td_proc; size = 0; size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { size += register_note(list, NT_PRSTATUS, __elfN(note_prstatus), thr); size += register_note(list, NT_FPREGSET, __elfN(note_fpregset), thr); size += register_note(list, NT_THRMISC, __elfN(note_thrmisc), thr); size += register_note(list, -1, __elfN(note_threadmd), thr); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } size += register_note(list, NT_PROCSTAT_PROC, __elfN(note_procstat_proc), p); size += register_note(list, NT_PROCSTAT_FILES, note_procstat_files, p); size += register_note(list, NT_PROCSTAT_VMMAP, note_procstat_vmmap, p); size += register_note(list, NT_PROCSTAT_GROUPS, note_procstat_groups, p); size += register_note(list, NT_PROCSTAT_UMASK, note_procstat_umask, p); size += register_note(list, NT_PROCSTAT_RLIMIT, note_procstat_rlimit, p); size += register_note(list, NT_PROCSTAT_OSREL, note_procstat_osrel, p); size += register_note(list, NT_PROCSTAT_PSSTRINGS, __elfN(note_procstat_psstrings), p); size += register_note(list, NT_PROCSTAT_AUXV, __elfN(note_procstat_auxv), p); *sizep = size; } static void __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, size_t notesz) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; ehdr = (Elf_Ehdr *)hdr; phdr = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = ELF_ARCH32; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = sizeof(Elf_Ehdr); ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = hdrsize; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = PF_R; phdr->p_align = ELF_NOTE_ROUNDSIZE; phdr++; /* All the writable segments from the program. 
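 *
 * (Illustrative aside, not part of the original file: a userland sketch of
 * the core header layout that puthdr() emits above -- one PT_NOTE program
 * header followed by one PT_LOAD header per writable segment, so e_phnum is
 * numsegs + 1.  The counts and sizes below are hypothetical.)
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <elf.h>

int
main(void)
{
	int numsegs = 25;
	size_t notesz = 2048;
	Elf64_Ehdr ehdr;
	Elf64_Phdr note;

	memset(&ehdr, 0, sizeof(ehdr));
	ehdr.e_ident[EI_MAG0] = ELFMAG0;
	ehdr.e_ident[EI_MAG1] = ELFMAG1;
	ehdr.e_ident[EI_MAG2] = ELFMAG2;
	ehdr.e_ident[EI_MAG3] = ELFMAG3;
	ehdr.e_ident[EI_CLASS] = ELFCLASS64;
	ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
	ehdr.e_ident[EI_VERSION] = EV_CURRENT;
	ehdr.e_type = ET_CORE;
	ehdr.e_phoff = sizeof(Elf64_Ehdr);
	ehdr.e_phentsize = sizeof(Elf64_Phdr);
	ehdr.e_phnum = numsegs + 1;	/* note segment + loadable segments */

	memset(&note, 0, sizeof(note));
	note.p_type = PT_NOTE;
	note.p_offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * ehdr.e_phnum;
	note.p_filesz = notesz;
	note.p_flags = PF_R;

	printf("e_phnum=%d, notes at offset %ju (%ju bytes)\n", ehdr.e_phnum,
	    (uintmax_t)note.p_offset, (uintmax_t)note.p_filesz);
	return (0);
}
/*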
*/ phc.phdr = phdr; phc.offset = round_page(hdrsize + notesz); each_writable_segment(td, cb_put_phdr, &phc); } static size_t register_note(struct note_info_list *list, int type, outfunc_t out, void *arg) { struct note_info *ninfo; size_t size, notesize; size = 0; out(arg, NULL, &size); ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK); ninfo->type = type; ninfo->outfunc = out; ninfo->outarg = arg; ninfo->outsize = size; TAILQ_INSERT_TAIL(list, ninfo, link); if (type == -1) return (size); notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static size_t append_note_data(const void *src, void *dst, size_t len) { size_t padded_len; padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE); if (dst != NULL) { bcopy(src, dst, len); bzero((char *)dst + len, padded_len - len); } return (padded_len); } size_t __elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp) { Elf_Note *note; char *buf; size_t notesize; buf = dst; if (buf != NULL) { note = (Elf_Note *)buf; note->n_namesz = sizeof(FREEBSD_ABI_VENDOR); note->n_descsz = size; note->n_type = type; buf += sizeof(*note); buf += append_note_data(FREEBSD_ABI_VENDOR, buf, sizeof(FREEBSD_ABI_VENDOR)); append_note_data(src, buf, size); if (descp != NULL) *descp = buf; } notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static void __elfN(putnote)(struct note_info *ninfo, struct sbuf *sb) { Elf_Note note; ssize_t old_len; if (ninfo->type == -1) { ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); return; } note.n_namesz = sizeof(FREEBSD_ABI_VENDOR); note.n_descsz = ninfo->outsize; note.n_type = ninfo->type; sbuf_bcat(sb, ¬e, sizeof(note)); sbuf_start_section(sb, &old_len); sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR)); sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (note.n_descsz == 0) return; sbuf_start_section(sb, &old_len); ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); } /* * Miscellaneous note out functions. 
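 *
 * (Illustrative aside, not part of the original file: register_note() and
 * populate_note() above size each note as a fixed header plus the
 * 4-byte-padded vendor string plus the 4-byte-padded descriptor.  A
 * standalone sketch with a hypothetical 120-byte descriptor:)
 */
#include <stdio.h>
#include <elf.h>

#define	ELF_NOTE_ROUNDSIZE	4
#define	roundup2(x, y)		(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	size_t namesz = sizeof("FreeBSD");	/* 8, including the NUL */
	size_t descsz = 120;			/* hypothetical payload */
	size_t notesize;

	notesize = sizeof(Elf64_Nhdr) +			/* note header */
	    roundup2(namesz, ELF_NOTE_ROUNDSIZE) +	/* padded name */
	    roundup2(descsz, ELF_NOTE_ROUNDSIZE);	/* padded descriptor */
	printf("notesize=%zu\n", notesize);		/* 12 + 8 + 120 = 140 */
	return (0);
}
/*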
*/ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_prpsinfo_t *psinfo; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void __elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. 
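 *
 * (Illustrative aside, not part of the original file: the note "out
 * functions" in this file follow a two-pass protocol -- called once with a
 * NULL sbuf they only report their size, called again with a real sbuf they
 * emit exactly that many bytes.  A much-simplified standalone sketch of the
 * idea, with the sbuf replaced by a plain buffer and a made-up payload:)
 */
#include <stdio.h>
#include <string.h>

static void
note_example(const void *arg, char *buf, size_t *sizep)
{
	const char *msg = arg;

	if (buf != NULL)
		memcpy(buf, msg, *sizep);	/* emit pass: size already known */
	*sizep = strlen(msg) + 1;		/* sizing pass (and re-check) */
}

int
main(void)
{
	char payload[64];
	size_t size = 0;

	note_example("hello, core", NULL, &size);	/* pass 1: learn the size */
	note_example("hello, core", payload, &size);	/* pass 2: emit the data */
	printf("%zu-byte note payload: %s\n", size, payload);
	return (0);
}
/*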
*/ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(struct kinfo_file); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(struct kinfo_vmentry); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < 
RLIM_NLIMITS; i++) lim_rlimit(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, const Elf_Phdr *pnote) { const Elf_Note *note, *note0, *note_end; const char *note_name; int i; if (pnote == NULL || pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) return (FALSE); note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) return (FALSE); if (note->n_namesz != checknote->hdr.n_namesz || note->n_descsz != checknote->hdr.n_descsz || note->n_type != checknote->hdr.n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->hdr.n_namesz >= (const char *)note_end || strncmp(checknote->vendor, note_name, checknote->hdr.n_namesz) != 0) goto nextnote; /* * Fetch the osreldate for binary * from the ELF OSABI-note if necessary. */ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 && checknote->trans_osrel != NULL) return (checknote->trans_osrel(note, osrel)); return (TRUE); nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } return (FALSE); } /* * Try to find the appropriate ABI-note section for checknote, * fetch the osreldate for binary from the ELF OSABI-note. Only the * first page of the image is searched, the same as for headers. 
*/ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; int i; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i])) return (TRUE); } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); #ifdef COMPRESS_USER_CORES /* * Compress and write out a core segment for a user process. * * 'inbuf' is the starting address of a VM segment in the process' address * space that is to be compressed and written out to the core file. 'dest_buf' * is a buffer in the kernel's address space. The segment is copied from * 'inbuf' to 'dest_buf' first before being processed by the compression * routine gzwrite(). This copying is necessary because the content of the VM * segment may change between the compression pass and the crc-computation pass * in gzwrite(). This is because realtime threads may preempt the UNIX kernel. * * If inbuf is NULL it is assumed that data is already copied to 'dest_buf'. */ static int compress_core (gzFile file, char *inbuf, char *dest_buf, unsigned int len, struct thread *td) { int len_compressed; int error = 0; unsigned int chunk_len; while (len) { if (inbuf != NULL) { chunk_len = (len > CORE_BUF_SIZE) ? CORE_BUF_SIZE : len; copyin(inbuf, dest_buf, chunk_len); inbuf += chunk_len; } else { chunk_len = len; } len_compressed = gzwrite(file, dest_buf, chunk_len); EVENTHANDLER_INVOKE(app_coredump_progress, td, len_compressed); if ((unsigned int)len_compressed != chunk_len) { log(LOG_WARNING, "compress_core: length mismatch (0x%x returned, " "0x%x expected)\n", len_compressed, chunk_len); EVENTHANDLER_INVOKE(app_coredump_error, td, "compress_core: length mismatch %x -> %x", chunk_len, len_compressed); error = EFAULT; break; } len -= chunk_len; maybe_yield(); } return (error); } #endif /* COMPRESS_USER_CORES */ static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) || defined(__ia64__) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } Index: stable/10/sys/kern/kern_descrip.c =================================================================== --- stable/10/sys/kern/kern_descrip.c (revision 284664) +++ stable/10/sys/kern/kern_descrip.c (revision 284665) @@ -1,4110 +1,4116 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_procdesc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static uma_zone_t file_zone; void (*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value); static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); static int fd_first_free(struct filedesc *fdp, int low, int size); static int fd_last_used(struct filedesc *fdp, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif); static int fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif); static int fill_pts_info(struct tty *tp, struct kinfo_file *kif); static int fill_sem_info(struct file *fp, struct kinfo_file *kif); static int fill_shm_info(struct file *fp, struct kinfo_file *kif); static int fill_socket_info(struct socket *so, struct kinfo_file *kif); static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif); static int getmaxfd(struct proc *p); /* * Each process has: * * 
- An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct filedescent *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct filedescent fd_dfiles[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at 0 and * not exceeding size - 1. Return -1 if not found. */ static int fd_last_used(struct filedesc *fdp, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } static int fdisused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp); KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. 
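 *
 * (Illustrative aside, not part of the original file: the descriptor map
 * consulted above is a plain bit array, one NDSLOTTYPE word per NDENTRIES
 * descriptors.  A standalone toy version -- with a hard-coded 8-bit byte
 * instead of __CHAR_BIT and a linear scan instead of ffsl() -- follows; the
 * open descriptors are made up.)
 */
#include <stdio.h>

typedef unsigned long NDSLOTTYPE;

#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
#define	NDENTRIES	(NDSLOTSIZE * 8)
#define	NDSLOT(x)	((x) / NDENTRIES)
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)

int
main(void)
{
	NDSLOTTYPE map[NDSLOTS(128)] = { 0 };
	int fd;

	/* Pretend descriptors 0, 1, 2 and 5 are open. */
	map[NDSLOT(0)] |= NDBIT(0);
	map[NDSLOT(1)] |= NDBIT(1);
	map[NDSLOT(2)] |= NDBIT(2);
	map[NDSLOT(5)] |= NDBIT(5);

	/* Lowest clear bit, i.e. what fd_first_free() would hand out next. */
	for (fd = 0; fd < 128; fd++)
		if ((map[NDSLOT(fd)] & NDBIT(fd)) == 0)
			break;
	printf("first free fd: %d\n", fd);	/* prints 3 */
	return (0);
}
/*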
*/ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, fd); } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void _fdfree(struct filedesc *fdp, int fd, int last) { struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES if (!last) seq_write_begin(&fde->fde_seq); #endif filecaps_free(&fde->fde_caps); if (last) return; bzero(fde, fde_change_size); fdunused(fdp, fd); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } static inline void fdfree(struct filedesc *fdp, int fd) { _fdfree(fdp, fd, 0); } static inline void fdfree_last(struct filedesc *fdp, int fd) { _fdfree(fdp, fd, 1); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; uint64_t lim; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(p); if (lim < td->td_retval[0]) td->td_retval[0] = lim; return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error; error = 0; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. 
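From userland, the limit that sys_getdtablesize() computes above is visible through getdtablesize(3) and the RLIMIT_NOFILE soft limit; a quick check:

#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;

	/* Reports the per-process descriptor table limit. */
	printf("getdtablesize(): %d\n", getdtablesize());

	/* The same number is derived from the RLIMIT_NOFILE soft limit. */
	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		printf("RLIMIT_NOFILE: soft %ju hard %ju\n",
		    (uintmax_t)rl.rlim_cur, (uintmax_t)rl.rlim_max);
	return (0);
}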
*/ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: cmd = F_GETLK; break; case F_OSETLK: cmd = F_SETLK; break; case F_OSETLKW: cmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, cmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; cap_rights_t rights; int error, flg, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; switch (cmd) { case F_DUPFD: tmp = arg; error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval); break; case F_DUPFD_CLOEXEC: tmp = arg; error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp, td->td_retval); break; case F_DUP2FD: tmp = arg; error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp, td->td_retval); break; case F_GETFD: FILEDESC_SLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fde = &fdp->fd_ofiles[fd]; td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_SUNLOCK(fdp); break; case F_SETFD: FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); error = EBADF; break; } fde = &fdp->fd_ofiles[fd]; fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL); if (error != 0) break; do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: cap_rights_init(&rights, CAP_FLOCK); error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: /* * Temporary api for testing remote lock * infrastructure. */ if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. 
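The do_setlk path above services POSIX byte-range locks requested through fcntl(2); an illustrative userland sequence (error handling trimmed, file name arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd;

	fd = open("lockfile", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		return (1);

	/* Request a write (exclusive) lock on the whole file. */
	memset(&fl, 0, sizeof(fl));
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;				/* zero length == to EOF */
	fl.l_type = F_WRLCK;
	if (fcntl(fd, F_SETLKW, &fl) == -1)	/* F_SETLKW: wait for it */
		perror("F_SETLKW");
	else
		puts("whole-file write lock acquired");

	/* Releasing is the same call with F_UNLCK. */
	fl.l_type = F_UNLCK;
	(void)fcntl(fd, F_SETLK, &fl);
	close(fd);
	return (0);
}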
This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; fp->f_seqcount = (arg + bsize - 1) / bsize; atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp, 0); fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct proc *p) { int maxfd; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (maxfd); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int do_dup(struct thread *td, int flags, int old, int new, register_t *retval) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *fp; struct file *delfp; int error, maxfd; p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (flags & DUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(p); if (new >= maxfd) return (flags & DUP_FCNTL ? EINVAL : EBADF); FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } oldfde = &fdp->fd_ofiles[old]; if (flags & DUP_FIXED && old == new) { *retval = new; if (flags & DUP_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); return (0); } fp = oldfde->fde_file; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ if (flags & DUP_FIXED) { if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). 
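The EINVAL-versus-EBADF rule spelled out in do_dup()'s comment above is observable from userland: dup2(2) rejects an out-of-range target descriptor with EBADF, while fcntl(F_DUPFD) must fail with EINVAL. A small demonstration (assumes the descriptor limit is far below 2^30):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int huge = 1 << 30;	/* far beyond any sane descriptor limit */

	errno = 0;
	if (dup2(0, huge) == -1)
		printf("dup2(0, huge)  -> %s\n", strerror(errno));	/* EBADF */

	errno = 0;
	if (fcntl(0, F_DUPFD, huge) == -1)
		printf("fcntl(F_DUPFD) -> %s\n", strerror(errno));	/* EINVAL */
	return (0);
}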
Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT - PROC_LOCK(p); - error = racct_set(p, RACCT_NOFILE, new + 1); - PROC_UNLOCK(p); - if (error != 0) { - FILEDESC_XUNLOCK(fdp); - fdrop(fp, td); - return (EMFILE); + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, new + 1); + PROC_UNLOCK(p); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (EMFILE); + } } #endif fdgrowtable_exp(fdp, new + 1); oldfde = &fdp->fd_ofiles[old]; } newfde = &fdp->fd_ofiles[new]; if (newfde->fde_file == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (error); } newfde = &fdp->fd_ofiles[new]; } KASSERT(fp == oldfde->fde_file, ("old fd has been modified")); KASSERT(old != new, ("new fd is same as old")); delfp = newfde->fde_file; /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif filecaps_free(&newfde->fde_caps); memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); if ((flags & DUP_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif *retval = new; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, 1); /* closefp() drops the FILEDESC lock for us. */ } else { FILEDESC_XUNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. 
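The hunk above, like the others in this file, wraps the racct_set() call in an explicit racct_enable check, so the accounting (and its PROC_LOCK round trip) is skipped on kernels that compile RACCT in but leave it disabled. Whether a running kernel has accounting enabled can be checked from userland through the kern.racct.enable sysctl; a small sketch, assuming the value is exported as an int:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int enable;
	size_t len = sizeof(enable);

	/*
	 * With RACCT_DEFAULT_TO_DISABLED this reads 0 unless the
	 * kern.racct.enable tunable was set at boot.
	 */
	if (sysctlbyname("kern.racct.enable", &enable, &len, NULL, 0) == -1) {
		perror("sysctlbyname");	/* e.g. kernel built without RACCT */
		return (1);
	}
	printf("resource accounting %s\n", enable ? "enabled" : "disabled");
	return (0);
}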
*/ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Function drops the filedesc lock on return. 
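fsetown() above is reached via fcntl(F_SETOWN) and the FIOSETOWN ioctl; the usual userland pattern names the signal recipient and then enables O_ASYNC so the descriptor delivers SIGIO. An illustrative sketch (not every descriptor type supports async notification):

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void
on_sigio(int sig)
{
	(void)sig;
	got_sigio = 1;
}

/* Arrange for SIGIO to be delivered to this process when 'fd' is ready. */
static int
enable_async(int fd)
{
	signal(SIGIO, on_sigio);
	if (fcntl(fd, F_SETOWN, getpid()) == -1)	/* who gets the signal */
		return (-1);
	return (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC));
}

int
main(void)
{
	if (enable_async(STDIN_FILENO) == -1)
		perror("enable_async");
	else
		puts("SIGIO delivery armed for stdin");
	return (0);
}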
*/ static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = 0; } } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, 1)); } /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int sys_closefrom(struct thread *td, struct closefrom_args *uap) { struct filedesc *fdp; int fd; fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ if (uap->lowfd < 0) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { if (fdp->fd_ofiles[fd].fde_file != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); } } FILEDESC_SUNLOCK(fdp); return (0); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrstat(sbp); #endif return (error); } /* * Return status information about a file descriptor. 
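sys_closefrom() above backs closefrom(2), typically used before exec to make sure no descriptors leak past stdio:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/*
	 * Close every descriptor numbered 3 and above; 0..2 stay open.
	 * On FreeBSD closefrom() returns void and never fails.
	 */
	closefrom(3);
	puts("descriptors >= 3 closed");
	return (0);
}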
*/ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int sys_nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp); if (error != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Initialize filecaps structure. */ void filecaps_init(struct filecaps *fcaps) { bzero(fcaps, sizeof(*fcaps)); fcaps->fc_nioctls = -1; } /* * Copy filecaps structure allocating memory for ioctls array if needed. */ void filecaps_copy(const struct filecaps *src, struct filecaps *dst) { size_t size; *dst = *src; if (src->fc_ioctls != NULL) { KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); bcopy(src->fc_ioctls, dst->fc_ioctls, size); } } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accomodate (at least) nfd descriptors. 
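sys_fpathconf() above is the kernel side of fpathconf(2); querying _PC_PIPE_BUF on a pipe descriptor, for example, returns the PIPE_BUF value seen in the DTYPE_PIPE branch:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int p[2];
	long v;

	if (pipe(p) == -1)
		return (1);
	v = fpathconf(p[0], _PC_PIPE_BUF);	/* PIPE_BUF for pipes */
	printf("_PC_PIPE_BUF: %ld\n", v);
	close(p[0]);
	close(p[1]);
	return (0);
}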
*/ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct filedescent *ntable; struct filedescent *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_ofiles; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the * file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data over and point at the new tables */ memcpy(ntable, otable, onfiles * sizeof(*otable)); fdp->fd_ofiles = ntable; /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * In order to have a valid pattern for fget_unlocked() * fdp->fd_nfiles must be the last member to be updated, otherwise * fget_unlocked() consumers may reference a new, higher value for * fdp->fd_nfiles before to access the fdp->fd_ofiles array, * resulting in OOB accesses. */ atomic_store_rel_int(&fdp->fd_nfiles, nnfiles); /* * Do not free the old file table, as some threads may still * reference entries within it. Instead, place it on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { ft = (struct freetable *)&otable[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(p); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT - PROC_LOCK(p); - error = racct_set(p, RACCT_NOFILE, allocfd); - PROC_UNLOCK(p); - if (error != 0) - return (EMFILE); + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, allocfd); + PROC_UNLOCK(p); + if (error != 0) + return (EMFILE); + } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. 
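The ordering argument in fdgrowtable() above (store the larger fd_ofiles array first, then publish fd_nfiles with release semantics so lockless readers never see a count larger than the table they can reach) is the usual publish/acquire pattern. A standalone C11 sketch of the same idea, with hypothetical names; the writer is assumed to be serialized by a lock, as it is in the kernel:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct entry { int value; };

struct table {
	_Atomic(struct entry *)	slots;	/* current array, never freed here */
	atomic_int		nslots;	/* published last, with release */
};

/* Writer: grow the array, then make the new size visible. */
static int
grow(struct table *t, int newsize)
{
	struct entry *os, *ns;
	int oldsize;

	oldsize = atomic_load_explicit(&t->nslots, memory_order_relaxed);
	os = atomic_load_explicit(&t->slots, memory_order_relaxed);
	ns = calloc((size_t)newsize, sizeof(*ns));
	if (ns == NULL)
		return (-1);
	if (os != NULL)
		memcpy(ns, os, (size_t)oldsize * sizeof(*ns));
	/* The old array is intentionally kept around, cf. struct freetable. */
	atomic_store_explicit(&t->slots, ns, memory_order_relaxed);
	/* Release: a reader that sees the new size also sees the new array. */
	atomic_store_explicit(&t->nslots, newsize, memory_order_release);
	return (0);
}

/* Lockless reader: acquire the size first, then indexing is safe. */
static struct entry *
lookup(struct table *t, int idx)
{
	if (idx < 0 ||
	    idx >= atomic_load_explicit(&t->nslots, memory_order_acquire))
		return (NULL);
	return (&atomic_load_explicit(&t->slots, memory_order_relaxed)[idx]);
}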
*/ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); if (!fdavail(td, n)) return (EMFILE); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Check to see whether n user file descriptors are available to the process * p. */ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp); /* * XXX: This is only called from uipc_usrreq.c:unp_externalize(); * call racct_add() from there instead of dealing with containers * here. */ lim = getmaxfd(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); for (i = fdp->fd_freefile; i < last; i++) { if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate a file decriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) { struct file *fp; int error, fd; error = falloc_noinstall(td, &fp); if (error) return (error); /* no reference held on error */ error = finstall(td, fp, &fd, flags, NULL); if (error) { fdrop(fp, td); /* one reference (fp only) */ return (error); } if (resultfp != NULL) *resultfp = fp; /* copy out result */ else fdrop(fp, td); /* release local reference */ if (resultfd != NULL) *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int falloc_noinstall(struct thread *td, struct file **resultfp) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, " "please see tuning(7).\n", td->td_ucred->cr_ruid); } return (ENFILE); } atomic_add_int(&openfiles, 1); fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; *resultfp = fp; return (0); } /* * Install a file in a file descriptor table. 
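falloc_noinstall() above enforces the global open-file budget: openfiles is checked against kern.maxfiles, with a small reserve left for privileged processes. The counters involved are exported as sysctls and can be read from userland:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
read_int_sysctl(const char *name, int *out)
{
	size_t len = sizeof(*out);

	return (sysctlbyname(name, out, &len, NULL, 0));
}

int
main(void)
{
	int nopen, maxfiles, maxperproc;

	if (read_int_sysctl("kern.openfiles", &nopen) == 0 &&
	    read_int_sysctl("kern.maxfiles", &maxfiles) == 0 &&
	    read_int_sysctl("kern.maxfilesperproc", &maxperproc) == 0)
		printf("open %d / max %d (per process %d)\n",
		    nopen, maxfiles, maxperproc);
	return (0);
}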
*/ int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; struct filedescent *fde; int error; KASSERT(fd != NULL, ("%s: fd == NULL", __func__)); KASSERT(fp != NULL, ("%s: fp == NULL", __func__)); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK(fdp); if ((error = fdalloc(td, 0, fd))) { FILEDESC_XUNLOCK(fdp); return (error); } fhold(fp); fde = &fdp->fd_ofiles[*fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = fp; if ((flags & O_CLOEXEC) != 0) fde->fde_flags |= UF_EXCLOSE; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif FILEDESC_XUNLOCK(fdp); return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); FILEDESC_LOCK_INIT(&newfdp->fd_fd); if (fdp != NULL) { FILEDESC_SLOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_SUNLOCK(fdp); } /* Create the file descriptor table. */ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { struct filedesc0 *fdp0; struct freetable *ft; int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; FILEDESC_LOCK_DESTROY(fdp); fdp0 = (struct filedesc0 *)fdp; while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) { SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next); free(ft->ft_table, M_FILEDESC); } free(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_XLOCK(fdp); fdp->fd_refcnt++; FILEDESC_XUNLOCK(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (p->p_fd->fd_refcnt == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_SLOCK(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); } /* copy all passable descriptors (i.e. 
not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (fdisused(fdp, i) && (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) && ofde->fde_file->f_ops != &badfileops) { nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps); fhold(nfde->fde_file); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) { if (newfdp->fd_ofiles[i].fde_file != NULL) fdused(newfdp, i); } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; FILEDESC_XUNLOCK(newfdp); return (newfdp); } /* * Release a filedesc structure. */ void fdescfree(struct thread *td) { struct filedesc *fdp; int i; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set(td->td_proc, RACCT_NOFILE, 0); - PROC_UNLOCK(td->td_proc); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set(td->td_proc, RACCT_NOFILE, 0); + PROC_UNLOCK(td->td_proc); + } #endif /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). 
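fdcopy() above (used when a process forks without sharing its table) duplicates the descriptor table but takes a new hold on the same underlying struct file, so parent and child share open-file state such as the offset; the standard demonstration:

#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;
	int fd;

	fd = open("/etc/passwd", O_RDONLY);
	if (fd == -1)
		return (1);

	pid = fork();
	if (pid == -1)
		return (1);
	if (pid == 0) {
		/* Child: advance the offset through its copy of the fd. */
		lseek(fd, 100, SEEK_SET);
		_exit(0);
	}
	waitpid(pid, NULL, 0);

	/* Parent sees 100: both descriptors reference one struct file. */
	printf("offset after child's lseek: %jd\n",
	    (intmax_t)lseek(fd, 0, SEEK_CUR));
	close(fd);
	return (0);
}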
*/ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); FILEDESC_XLOCK(fdp); i = --fdp->fd_refcnt; if (i > 0) { FILEDESC_XUNLOCK(fdp); return; } cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL) { fdfree_last(fdp, i); (void) closef(fp, td); } } if (fdp->fd_nfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (cdir != NULL) vrele(cdir); if (rdir != NULL) vrele(rdir); if (jdir != NULL) vrele(jdir); fddrop(fdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); /* closefp() drops the FILEDESC lock. 
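fdcloseexec() above is what honours UF_EXCLOSE during execve(); from userland the flag is requested with O_CLOEXEC at open time or FD_CLOEXEC via fcntl(F_SETFD):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	/* Mark the descriptor close-on-exec atomically at open time. */
	fd = open("/etc/passwd", O_RDONLY | O_CLOEXEC);
	if (fd == -1)
		return (1);

	/* Equivalent two-step form for an already-open descriptor. */
	(void)fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);

	printf("FD_CLOEXEC set: %d\n",
	    (fcntl(fd, F_GETFD) & FD_CLOEXEC) != 0);
	/* Any execve() from this process would now close 'fd'. */
	close(fd);
	return (0);
}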
*/ FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t retval, save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; if (devnull < 0) { save = td->td_retval[0]; error = kern_open(td, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); devnull = td->td_retval[0]; td->td_retval[0] = save; if (error) break; KASSERT(devnull == i, ("oof, we didn't get our fd")); } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. 
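The rule closef() implements above (closing any descriptor referring to a vnode releases every POSIX lock the process holds on it) is a well-known fcntl locking pitfall; the sequence below loses the lock even though the descriptor it was taken through stays open (file name arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd1, fd2;

	fd1 = open("lockfile", O_RDWR | O_CREAT, 0644);
	fd2 = open("lockfile", O_RDWR);		/* second fd, same file */
	if (fd1 == -1 || fd2 == -1)
		return (1);

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;			/* whole file */
	if (fcntl(fd1, F_SETLK, &fl) == -1)
		return (1);

	/*
	 * Closing *any* descriptor for the file releases the POSIX lock
	 * taken through fd1, exactly as described in closef().
	 */
	close(fd2);
	puts("lock on fd1 silently released by close(fd2)");
	close(fd1);
	return (0);
}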
*/ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, int needfcntl, struct file **fpp, cap_rights_t *haverightsp) { #ifdef CAPABILITIES struct filedescent fde; #endif struct file *fp; u_int count; #ifdef CAPABILITIES seq_t seq; cap_rights_t haverights; int error; #endif /* * Avoid reads reordering and then a first access to the * fdp->fd_ofiles table which could result in OOB operation. */ if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles)) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seq_read(fd_seq(fdp, fd)); fde = fdp->fd_ofiles[fd]; if (!seq_consistent(fd_seq(fdp, fd), seq)) { cpu_spinwait(); continue; } fp = fde.fde_file; #else fp = fdp->fd_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES haverights = *cap_rights_fde(&fde); if (needrightsp != NULL) { error = cap_check(&haverights, needrightsp); if (error != 0) return (error); if (cap_rights_is_set(needrightsp, CAP_FCNTL)) { error = cap_fcntl_check_fde(&fde, needfcntl); if (error != 0) return (error); } } #endif count = fp->f_count; if (count == 0) continue; /* * Use an acquire barrier to prevent caching of fd_ofiles * so it is refreshed for verification. */ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1) continue; #ifdef CAPABILITIES if (seq_consistent_nomb(fd_seq(fdp, fd), seq)) #else if (fp == fdp->fd_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (haverightsp != NULL) { #ifdef CAPABILITIES *haverightsp = haverights; #else CAP_ALL(haverightsp); #endif } return (0); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp, u_char *maxprotp) { struct filedesc *fdp; struct file *fp; cap_rights_t haverights, needrights; int error; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); if (needrightsp != NULL) needrights = *needrightsp; else cap_rights_init(&needrights); if (maxprotp != NULL) cap_rights_set(&needrights, CAP_MMAP); error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights); if (error != 0) return (error); if (fp->f_ops == &badfileops) { fdrop(fp, td); return (EBADF); } #ifdef CAPABILITIES /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&haverights); #else /* !CAPABILITIES */ if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; #endif /* CAPABILITIES */ /* * FREAD and FWRITE failure return EBADF as per POSIX. 
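The reference acquisition in fget_unlocked() above (reload the count, bail out if it has reached zero, and only then compare-and-swap it one higher) is a general lockless pattern for refusing to resurrect an object that is already being torn down. A C11 sketch with hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_uint	refs;	/* 0 means the object is being destroyed */
};

/*
 * Try to take a reference without locks.  Fails if the count has already
 * dropped to zero, mirroring the f_count check in fget_unlocked().
 */
static bool
ref_acquire_if_live(struct object *o)
{
	unsigned count;

	count = atomic_load_explicit(&o->refs, memory_order_relaxed);
	for (;;) {
		if (count == 0)
			return (false);	/* too late: never resurrect */
		/* The CAS reloads 'count' on failure, so just retry. */
		if (atomic_compare_exchange_weak_explicit(&o->refs, &count,
		    count + 1, memory_order_acquire, memory_order_relaxed))
			return (true);
	}
}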
*/ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return(_fget(td, fd, fpp, 0, rightsp, NULL)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp, maxprotp)); } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, rightsp, NULL)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp, NULL); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filedesc *fdp; struct file *fp; #ifdef CAPABILITIES int error; #endif if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); fp = fget_locked(fdp, fd); if (fp == NULL || fp->f_ops == &badfileops) return (EBADF); #ifdef CAPABILITIES if (needrightsp != NULL) { error = cap_check(cap_rights(fdp, fd), needrightsp); if (error != 0) return (error); } #endif if (fp->f_vnode == NULL) return (EINVAL); *vpp = fp->f_vnode; vref(*vpp); filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps); return (0); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if the * descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock * in the future. * * Note: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being free'd * during use. 
*/ int fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp, u_int *fflagp) { struct file *fp; int error; *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } fdrop(fp, td); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * Note: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { ACCEPT_LOCK(); SOCK_LOCK(so); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); } /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; error = 0; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); if (fp->f_ops != &badfileops) error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. 
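sys_flock() above maps the LOCK_* requests onto the same VOP_ADVLOCK() machinery but with F_FLOCK semantics, where the lock is tied to the open file rather than the process; typical usage (file name arbitrary):

#include <sys/file.h>		/* flock(), LOCK_* */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	fd = open("lockfile", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		return (1);

	/* LOCK_NB fails with EWOULDBLOCK instead of waiting. */
	if (flock(fd, LOCK_EX | LOCK_NB) == -1)
		perror("flock");
	else
		puts("exclusive flock(2) lock held until close or LOCK_UN");

	(void)flock(fd, LOCK_UN);
	close(fd);
	return (0);
}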
*/ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } fhold(fp); newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); bzero(oldfde, fde_change_size); fdunused(fdp, dfd); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } if (fdp->fd_jdir == olddp) { vref(newdp); fdp->fd_jdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vref(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vref(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vref(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = fdhold(p); if (fdp == NULL) continue; /* overestimates sparse tables. 
*/ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif #ifdef COMPAT_FREEBSD7 static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req) { int error; char *fullpath, *freepath; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); vref(vp); kif->kf_fd = type; kif->kf_type = KF_TYPE_VNODE; /* This function only handles directories. */ if (vp->v_type != VDIR) { vrele(vp); return (ENOTDIR); } kif->kf_vnode_type = KF_VTYPE_VDIR; /* * This is not a true file descriptor, so we set a bogus refcount * and offset to indicate these fields should be ignored. */ kif->kf_ref_count = -1; kif->kf_offset = -1; freepath = NULL; fullpath = "-"; FILEDESC_SUNLOCK(fdp); vn_fullpath(curthread, vp, &fullpath, &freepath); vrele(vp); strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); if (freepath != NULL) free(freepath, M_TEMP); error = SYSCTL_OUT(req, kif, sizeof(*kif)); FILEDESC_SLOCK(fdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. 
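A side note on consuming sysctl_kern_file() from userland: the handler above exports the system-wide file table as a stream of struct xfile records under the kern.file name. A minimal, hedged sketch that only asks sysctlbyname(3) for the required buffer size (decoding the records would additionally need the struct xfile definition):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 0;

	/* A NULL old-buffer pointer just reports the size needed. */
	if (sysctlbyname("kern.file", NULL, &len, NULL, 0) == -1) {
		perror("sysctlbyname(kern.file)");
		return (1);
	}
	printf("kern.file currently needs %zu bytes of struct xfile records\n",
	    len);
	return (0);
}

As the handler shows, the size estimate deliberately overestimates sparse descriptor tables, so a second call passing a buffer of this length should not come up short.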
*/ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { char *fullpath, *freepath; struct kinfo_ofile *kif; struct filedesc *fdp; int error, i, *name; struct shmfd *shmfd; struct socket *so; struct vnode *vp; struct ksem *ks; struct file *fp; struct proc *p; struct tty *tp; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); if (fdp->fd_cdir != NULL) export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, fdp, req); if (fdp->fd_rdir != NULL) export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, fdp, req); if (fdp->fd_jdir != NULL) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, fdp, req); for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); ks = NULL; vp = NULL; so = NULL; tp = NULL; shmfd = NULL; kif->kf_fd = i; switch (fp->f_type) { case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; break; case DTYPE_SOCKET: kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; break; case DTYPE_PIPE: kif->kf_type = KF_TYPE_PIPE; break; case DTYPE_FIFO: kif->kf_type = KF_TYPE_FIFO; vp = fp->f_vnode; break; case DTYPE_KQUEUE: kif->kf_type = KF_TYPE_KQUEUE; break; case DTYPE_CRYPTO: kif->kf_type = KF_TYPE_CRYPTO; break; case DTYPE_MQUEUE: kif->kf_type = KF_TYPE_MQUEUE; break; case DTYPE_SHM: kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; break; case DTYPE_SEM: kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; break; case DTYPE_PTS: kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; break; #ifdef PROCDESC case DTYPE_PROCDESC: kif->kf_type = KF_TYPE_PROCDESC; break; #endif default: kif->kf_type = KF_TYPE_UNKNOWN; break; } kif->kf_ref_count = fp->f_count; if (fp->f_flag & FREAD) kif->kf_flags |= KF_FLAG_READ; if (fp->f_flag & FWRITE) kif->kf_flags |= KF_FLAG_WRITE; if (fp->f_flag & FAPPEND) kif->kf_flags |= KF_FLAG_APPEND; if (fp->f_flag & FASYNC) kif->kf_flags |= KF_FLAG_ASYNC; if (fp->f_flag & FFSYNC) kif->kf_flags |= KF_FLAG_FSYNC; if (fp->f_flag & FNONBLOCK) kif->kf_flags |= KF_FLAG_NONBLOCK; if (fp->f_flag & O_DIRECT) kif->kf_flags |= KF_FLAG_DIRECT; if (fp->f_flag & FHASLOCK) kif->kf_flags |= KF_FLAG_HASLOCK; kif->kf_offset = foffset_get(fp); if (vp != NULL) { vref(vp); switch (vp->v_type) { case VNON: kif->kf_vnode_type = KF_VTYPE_VNON; break; case VREG: kif->kf_vnode_type = KF_VTYPE_VREG; break; case VDIR: kif->kf_vnode_type = KF_VTYPE_VDIR; break; case VBLK: kif->kf_vnode_type = KF_VTYPE_VBLK; break; case VCHR: kif->kf_vnode_type = KF_VTYPE_VCHR; break; case VLNK: kif->kf_vnode_type = KF_VTYPE_VLNK; break; case VSOCK: kif->kf_vnode_type = KF_VTYPE_VSOCK; break; case VFIFO: kif->kf_vnode_type = KF_VTYPE_VFIFO; break; case VBAD: kif->kf_vnode_type = KF_VTYPE_VBAD; break; default: kif->kf_vnode_type = KF_VTYPE_UNKNOWN; break; } /* * It is OK to drop the filedesc lock here as we will * re-validate and re-evaluate its properties when * the loop continues. 
*/ freepath = NULL; fullpath = "-"; FILEDESC_SUNLOCK(fdp); vn_fullpath(curthread, vp, &fullpath, &freepath); vrele(vp); strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); if (freepath != NULL) free(freepath, M_TEMP); FILEDESC_SLOCK(fdp); } if (so != NULL) { struct sockaddr *sa; if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; } if (tp != NULL) { strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); } if (shmfd != NULL) shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path)); if (ks != NULL && ksem_info != NULL) ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL); error = SYSCTL_OUT(req, kif, sizeof(*kif)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif struct export_fd_buf { struct filedesc *fdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; }; static int export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt, int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; #define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table)) struct kinfo_file *kif; struct vnode *vp; int error, locked; unsigned int i; if (efbuf->remainder == 0) return (0); kif = &efbuf->kif; bzero(kif, sizeof(*kif)); locked = efbuf->fdp != NULL; switch (type) { case KF_TYPE_FIFO: case KF_TYPE_VNODE: if (locked) { FILEDESC_SUNLOCK(efbuf->fdp); locked = 0; } vp = (struct vnode *)data; error = fill_vnode_info(vp, kif); vrele(vp); break; case KF_TYPE_SOCKET: error = fill_socket_info((struct socket *)data, kif); break; case KF_TYPE_PIPE: error = fill_pipe_info((struct pipe *)data, kif); break; case KF_TYPE_PTS: error = fill_pts_info((struct tty *)data, kif); break; case KF_TYPE_PROCDESC: error = fill_procdesc_info((struct procdesc *)data, kif); break; case KF_TYPE_SEM: error = fill_sem_info((struct file *)data, kif); break; case KF_TYPE_SHM: error = fill_shm_info((struct file *)data, kif); break; default: error = 0; } if (error == 0) kif->kf_status |= KF_ATTR_VALID; /* * Translate file access flags. 
*/ for (i = 0; i < NFFLAGS; i++) if (fflags & fflags_table[i].fflag) kif->kf_flags |= fflags_table[i].kf_fflag; if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_type = type; kif->kf_ref_count = refcnt; kif->kf_offset = offset; /* Pack record size down */ kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; if (efbuf->fdp != NULL && !locked) FILEDESC_SLOCK(efbuf->fdp); return (0); } efbuf->remainder -= kif->kf_structsize; } if (locked) FILEDESC_SUNLOCK(efbuf->fdp); error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM; if (efbuf->fdp != NULL) FILEDESC_SLOCK(efbuf->fdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct file *fp; struct filedesc *fdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; int64_t offset; void *data; int error, i; int type, refcnt, fflags; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vref(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vref(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vref(cttyvp); } fdp = fdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; if (tracevp != NULL) export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE, FREAD | FWRITE, -1, -1, NULL, efbuf); if (textvp != NULL) export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT, FREAD, -1, -1, NULL, efbuf); if (cttyvp != NULL) export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY, FREAD | FWRITE, -1, -1, NULL, efbuf); error = 0; if (fdp == NULL) goto fail; efbuf->fdp = fdp; FILEDESC_SLOCK(fdp); /* working directory */ if (fdp->fd_cdir != NULL) { vref(fdp->fd_cdir); data = fdp->fd_cdir; export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD, FREAD, -1, -1, NULL, efbuf); } /* root directory */ if (fdp->fd_rdir != NULL) { vref(fdp->fd_rdir); data = fdp->fd_rdir; export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT, FREAD, -1, -1, NULL, efbuf); } /* jail directory */ if (fdp->fd_jdir != NULL) { vref(fdp->fd_jdir); data = fdp->fd_jdir; export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL, FREAD, -1, -1, NULL, efbuf); } for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; data = NULL; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ cap_rights_init(&rights); #endif switch (fp->f_type) { case DTYPE_VNODE: type = KF_TYPE_VNODE; vref(fp->f_vnode); data = fp->f_vnode; break; case DTYPE_SOCKET: type = KF_TYPE_SOCKET; data = fp->f_data; break; case DTYPE_PIPE: type = KF_TYPE_PIPE; data = fp->f_data; break; case DTYPE_FIFO: type = KF_TYPE_FIFO; vref(fp->f_vnode); data = fp->f_vnode; break; case DTYPE_KQUEUE: type = KF_TYPE_KQUEUE; break; case DTYPE_CRYPTO: type = KF_TYPE_CRYPTO; break; case DTYPE_MQUEUE: type = KF_TYPE_MQUEUE; break; case DTYPE_SHM: type = KF_TYPE_SHM; data = fp; break; case DTYPE_SEM: type = 
KF_TYPE_SEM; data = fp; break; case DTYPE_PTS: type = KF_TYPE_PTS; data = fp->f_data; break; #ifdef PROCDESC case DTYPE_PROCDESC: type = KF_TYPE_PROCDESC; data = fp->f_data; break; #endif default: type = KF_TYPE_UNKNOWN; break; } refcnt = fp->f_count; fflags = fp->f_flag; offset = foffset_get(fp); /* * Create sysctl entry. * It is OK to drop the filedesc lock here as we will * re-validate and re-evaluate its properties when * the loop continues. */ error = export_fd_to_sb(data, type, i, fflags, refcnt, offset, &rights, efbuf); if (error != 0) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); fail: free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; #define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table)) unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < NVTYPES; i++) if (vtypes_table[i].vtype == vtype) break; if (i < NVTYPES) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; if (vp == NULL) return (1); kif->kf_vnode_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(curthread, vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); /* * Retrieve vnode attributes. 
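The kern.proc.filedesc machinery above (kern_proc_filedesc_out() feeding sysctl_kern_proc_filedesc()) is what procstat(1) and friends consume. From userland the usual entry point is kinfo_getfile(3) in libutil, which wraps that sysctl and returns a malloc(3)ed array of struct kinfo_file. A minimal sketch listing the calling process's own descriptors (link with -lutil):

#include <sys/types.h>
#include <sys/user.h>		/* struct kinfo_file */
#include <libutil.h>		/* kinfo_getfile() */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct kinfo_file *kif;
	int cnt, i;

	kif = kinfo_getfile(getpid(), &cnt);
	if (kif == NULL) {
		perror("kinfo_getfile");
		return (1);
	}
	for (i = 0; i < cnt; i++)
		printf("fd %3d type %2d path '%s'\n",
		    kif[i].kf_fd, kif[i].kf_type, kif[i].kf_path);
	free(kif);
	return (0);
}

The variable-length kf_structsize packing done in export_fd_to_sb() is what allows the raw sysctl output to be walked record by record; kinfo_getfile(3) hides that detail behind a fixed-size array.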
*/ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; return (0); } static int fill_socket_info(struct socket *so, struct kinfo_file *kif) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; int error; if (so == NULL) return (1); kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch(kif->kf_sock_domain) { case AF_INET: case AF_INET6: if (kif->kf_sock_protocol == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); return (0); } static int fill_pts_info(struct tty *tp, struct kinfo_file *kif) { if (tp == NULL) return (1); kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif) { if (pi == NULL) return (1); kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; return (0); } static int fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif) { if (pdp == NULL) return (1); kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } static int fill_sem_info(struct file *fp, struct kinfo_file *kif) { struct thread *td; struct stat sb; td = curthread; if (fp->f_data == NULL) return (1); if (fo_stat(fp, &sb, td->td_ucred, td) != 0) return (1); if (ksem_info == NULL) return (1); ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path), &kif->kf_un.kf_sem.kf_sem_value); kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode; return (0); } static int fill_shm_info(struct file *fp, struct kinfo_file *kif) { struct thread *td; struct stat sb; td = curthread; if (fp->f_data == NULL) return (1); if (fo_stat(fp, &sb, td->td_ucred, td) != 0) return (1); shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path)); kif->kf_un.kf_file.kf_file_mode = sb.st_mode; kif->kf_un.kf_file.kf_file_size = sb.st_size; return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); #ifdef DDB /* * For the purposes of debugging, 
generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } static 
int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, }; int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); Index: stable/10/sys/kern/kern_exit.c =================================================================== --- stable/10/sys/kern/kern_exit.c (revision 284664) +++ stable/10/sys/kern/kern_exit.c (revision 284665) @@ -1,1360 +1,1362 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_procdesc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, kernel, , exit, "int"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else parent = initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. 
*/ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_LOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } /* * exit -- death of process. */ void sys_sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q, *t; struct thread *tdt; struct vnode *ttyvp = NULL; mtx_assert(&Giant, MA_NOTOWNED); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * Deref SU mp, since the thread does not return to userspace. */ if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. 
Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); p->p_xstat = rv; /* Let event handler change exit status */ PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); rv = p->p_xstat; /* Event handler could change exit status */ stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdescfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. 
*/ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp = p->p_session; struct tty *tp; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); /* Release the TTY now we've unlocked everything. */ if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if (!(q->p_flag & P_TRACED)) { proc_reparent(q, q->p_reaper); } else { /* * Traced processes are killed since their existence * means someone is screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper); } else { PROC_LOCK(t); proc_reparent(q, t); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. */ clear_orphan(q); q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, tdt) tdt->td_dbgflags &= ~TDB_SUSPEND; kern_psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); clear_orphan(q); PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; /* Tell the prison that we are gone. 
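The SESS_LEADER block above is the kernel half of the classic hangup sequence: when a session leader exits, the foreground process group of its controlling terminal receives SIGHUP and the tty vnode is revoked. That is also why the traditional daemonization idiom detaches from the session up front, so no later leader exit can reach the daemon through this path. A minimal sketch of that idiom; in practice FreeBSD's daemon(3) wraps the same steps:

#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Detach from the controlling terminal: after this, the SIGHUP /
 * revoke handling shown above can no longer affect the process.
 */
static void
detach(void)
{
	pid_t pid;

	if ((pid = fork()) < 0)
		exit(1);
	if (pid > 0)
		_exit(0);	/* original parent returns to its shell */
	if (setsid() < 0)	/* new session, no controlling terminal */
		exit(1);
	if (fork() > 0)
		_exit(0);	/* drop leadership: cannot reacquire a ctty */
	if (chdir("/") != 0)
		exit(1);
}

int
main(void)
{
	detach();
	/* ... long-running work would go here ... */
	return (0);
}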
*/ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(rv)) reason = CLD_DUMPED; else if (WIFSIGNALED(rv)) reason = CLD_KILLED; SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ #ifdef PROCDESC if (p->p_procdesc == NULL || procdesc_exit(p)) { #endif /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) childproc_exited(p); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ kern_psignal(p->p_pptr, p->p_sigparent); } #ifdef PROCDESC } else PROC_LOCK(p->p_pptr); #endif sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); umtx_thread_exit(td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. 
*/ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->nargs > 0) { if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; } /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs > 0) { sbuf_printf(sb, "("); for (i = 0;i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, W_EXITCODE(0, sig)); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). */ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. 
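sys_wait4() and sys_wait6() above are thin copyin/copyout wrappers around kern_wait() and kern_wait6(); the actual collection of a zombie happens in proc_reap() below. A minimal userland sketch of the common path, reaping a child with wait4(2) and picking up both its exit status and rusage:

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rusage ru;
	int status;
	pid_t pid;

	if ((pid = fork()) == -1) {
		perror("fork");
		return (1);
	}
	if (pid == 0)
		_exit(7);		/* child: exit with a known status */

	/* Parent: reap the child, also fetching its resource usage. */
	if (wait4(pid, &status, 0, &ru) == -1) {
		perror("wait4");
		return (1);
	}
	if (WIFEXITED(status))
		printf("child %d exited with %d, max RSS %ld KiB\n",
		    (int)pid, WEXITSTATUS(status), ru.ru_maxrss);
	return (0);
}

wait6(2) follows the same shape but takes an explicit (idtype, id) pair plus a siginfo_t, which is why kern_wait() below simply translates the legacy pid encodings before calling kern_wait6().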
*/ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); q = td->td_proc; PROC_SUNLOCK(p); td->td_retval[0] = p->p_pid; if (status) *status = p->p_xstat; /* convert to int */ if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); PROC_UNLOCK(p); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != 0) { t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t); p->p_oppid = 0; PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); PROC_LOCK(p); clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); #ifdef PROCDESC if (p->p_procdesc != NULL) procdesc_reap(p); #endif sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that all other writes to * this proc are visible now, so no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT - PROC_LOCK(p); - racct_sub(p, RACCT_NPROC, 1); - PROC_UNLOCK(p); + if (racct_enable) { + PROC_LOCK(p); + racct_sub(p, RACCT_NPROC, 1); + PROC_UNLOCK(p); + } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. 
*/ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: break; case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } PROC_SLOCK(p); if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xstat)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xstat); } else if (WIFSIGNALED(p->p_xstat)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xstat); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = WEXITSTATUS(p->p_xstat); } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; calcru(p, &rup->ru_utime, &rup->ru_stime); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE) { proc_reap(td, p, status, options); return (-1); } PROC_SUNLOCK(p); PROC_UNLOCK(p); return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. 
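One userland-visible consequence of the siginfo handling in proc_to_reap() above: the si_code it derives from the exit status (CLD_EXITED, CLD_KILLED or CLD_DUMPED) is exactly what an SA_SIGINFO handler for SIGCHLD observes. A small sketch; note that a real program would not call printf(3) from a signal handler and would guard against the SIGCHLD arriving before pause(2):

#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
chld_handler(int sig, siginfo_t *si, void *ctx)
{
	const char *how;

	(void)sig;
	(void)ctx;
	/* si_code mirrors the CLD_* classification made by the kernel. */
	how = si->si_code == CLD_EXITED ? "exited" :
	    si->si_code == CLD_KILLED ? "killed" :
	    si->si_code == CLD_DUMPED ? "dumped core" : "other";
	printf("pid %d %s, status %d\n", (int)si->si_pid, how, si->si_status);
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = chld_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGCHLD, &sa, NULL);

	if (fork() == 0)
		_exit(3);
	pause();		/* demo only: wait for the SIGCHLD */
	wait(NULL);		/* then reap the zombie via the path above */
	return (0);
}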
The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; int error, nfound, ret; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. */ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo); if (ret == 0) continue; else if (ret == 1) nfound++; else return (0); PROC_LOCK(p); PROC_SLOCK(p); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0 && (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status != NULL) *status = W_STOPCODE(p->p_xstat); if (siginfo != NULL) { siginfo->si_status = p->p_xstat; siginfo->si_code = CLD_TRAPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x (xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xstat), p->p_xstat, p->p_xthread != NULL ? 
p->p_xthread->td_tid : -1); PROC_UNLOCK(p); return (0); } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status != NULL) *status = W_STOPCODE(p->p_xstat); if (siginfo != NULL) { siginfo->si_status = p->p_xstat; siginfo->si_code = CLD_STOPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); return (0); } PROC_SUNLOCK(p); if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if ((options & WNOWAIT) == 0) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); if (status != NULL) *status = SIGCONT; if (siginfo != NULL) { siginfo->si_status = SIGCONT; siginfo->si_code = CLD_CONTINUED; } return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo); if (ret == 0) continue; else if (ret == 1) nfound++; else return (0); } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); clear_orphan(child); if (child->p_flag & P_TRACED) { if (LIST_EMPTY(&child->p_pptr->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } child->p_pptr = parent; } Index: stable/10/sys/kern/kern_jail.c =================================================================== --- stable/10/sys/kern/kern_jail.c (revision 284664) +++ stable/10/sys/kern/kern_jail.c (revision 284665) @@ -1,4761 +1,4775 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #ifdef INET6 #include #endif /* INET6 */ #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. 
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif #ifdef INET static int _prison_check_ip4(struct prison *pr, struct in_addr *ia); static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4); #endif #ifdef INET6 static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6); static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. 
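The pr_flag_names and pr_flag_nonames tables above are sparse C99 designated-initializer arrays: slots for options compiled out by INET/INET6 stay NULL and are skipped at run time, and the array sizes are exported as variables because an external reader such as kvm(3) cannot deduce the length of a sparse array on its own. A minimal userspace sketch of the same pattern, with made-up flag names, might look like this (illustrative only, not part of the patch):

        #include <stdio.h>
        #include <stddef.h>

        /* Sparse table: slots 1 and 2 are intentionally left NULL. */
        static const char *flag_names[] = {
                [0] = "persist",
                [3] = "example.flag",   /* hypothetical name, for illustration */
        };
        const size_t flag_names_size = sizeof(flag_names); /* exported for external readers */

        int
        main(void)
        {
                size_t fi;

                for (fi = 0; fi < sizeof(flag_names) / sizeof(flag_names[0]); fi++) {
                        if (flag_names[fi] == NULL)     /* skip holes in the sparse array */
                                continue;
                        printf("bit %zu -> %s\n", fi, flag_names[fi]);
                }
                return (0);
        }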
*/ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } #ifdef INET static int qcmp_v4(const void *ip1, const void *ip2) { in_addr_t iaa, iab; /* * We need to compare in HBO here to get the list sorted as expected * by the result of the code. Sorting NBO addresses gives you * interesting results. If you do not understand, do not try. */ iaa = ntohl(((const struct in_addr *)ip1)->s_addr); iab = ntohl(((const struct in_addr *)ip2)->s_addr); /* * Do not simply return the difference of the two numbers, the int is * not wide enough. */ if (iaa > iab) return (1); else if (iaa < iab) return (-1); else return (0); } #endif #ifdef INET6 static int qcmp_v6(const void *ip1, const void *ip2) { const struct in6_addr *ia6a, *ia6b; int i, rc; ia6a = (const struct in6_addr *)ip1; ia6b = (const struct in6_addr *)ip2; rc = 0; for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) { if (ia6a->s6_addr[i] > ia6b->s6_addr[i]) rc = 1; else if (ia6a->s6_addr[i] < ia6b->s6_addr[i]) rc = -1; } return (rc); } #endif /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + sizeof(pr_allow_names) / sizeof(pr_allow_names[0]) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 
1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. 
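kern_jail() above converts the old jail(2) structure into name/value iovec pairs and feeds them to kern_jail_set(), which is the same interface jail_set(2) exposes to userland. A hedged userspace sketch of creating and attaching to a persistent jail through that interface (parameter names taken from this file, error handling trimmed, details such as the zero-length boolean value are my reading of the option parser rather than a literal quote):

        #include <sys/param.h>
        #include <sys/jail.h>
        #include <sys/uio.h>
        #include <err.h>
        #include <stdio.h>
        #include <string.h>

        /* Append a "name"/"value" iovec pair. */
        static void
        add_param(struct iovec *iov, u_int *niov, const char *name, void *val, size_t len)
        {
                iov[*niov].iov_base = __DECONST(char *, name);
                iov[*niov].iov_len = strlen(name) + 1;
                (*niov)++;
                iov[*niov].iov_base = val;
                iov[*niov].iov_len = len;
                (*niov)++;
        }

        int
        main(void)
        {
                struct iovec iov[8];
                u_int niov = 0;
                char name[] = "demo";           /* hypothetical jail name */
                char path[] = "/";              /* use a real jail root in practice */
                char host[] = "demo.example.org";
                int jid;

                add_param(iov, &niov, "name", name, sizeof(name));
                add_param(iov, &niov, "path", path, sizeof(path));
                add_param(iov, &niov, "host.hostname", host, sizeof(host));
                add_param(iov, &niov, "persist", NULL, 0); /* boolean: presence means true */

                jid = jail_set(iov, niov, JAIL_CREATE | JAIL_ATTACH);
                if (jid == -1)
                        err(1, "jail_set");
                printf("created and attached to jid %d\n", jid);
                return (0);
        }

Running it needs the same privilege the syscall checks first (PRIV_JAIL_SET, plus PRIV_JAIL_ATTACH when attaching).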
*/ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen; int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = ppr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
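Each optional parameter fetched below follows one convention: vfs_copyopt()/vfs_getopt() returning ENOENT means "the caller did not pass this option", any other error aborts the call, and success sets a gotX flag so later code knows whether to apply the value or inherit it. A tiny standalone sketch of that tri-state convention, with a hypothetical stand-in for the option lookup:

        #include <errno.h>
        #include <stdio.h>

        /* Hypothetical stand-in for vfs_copyopt(): 0 if found, ENOENT if the
         * option was not supplied, another errno on a real failure. */
        static int
        copyopt(const char *name, int *valp)
        {
                if (name[0] == 's') {   /* pretend only "securelevel" was passed */
                        *valp = 2;
                        return (0);
                }
                return (ENOENT);
        }

        int
        main(void)
        {
                int slevel, gotslevel, error;

                error = copyopt("securelevel", &slevel);
                if (error == ENOENT)
                        gotslevel = 0;          /* not given: keep the existing value */
                else if (error != 0)
                        return (error);         /* real failure */
                else
                        gotslevel = 1;          /* given: apply it later */

                printf("gotslevel=%d\n", gotslevel);
                if (gotslevel)
                        printf("securelevel=%d\n", slevel);
                return (0);
        }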
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto 
done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER | PR_IP4_DISABLE; if (ip4s == 0) pr_flags |= PR_IP4_USER | PR_IP4_DISABLE; else { pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER; ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. 
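The sort and duplicate checks here rely on the qcmp_v4() comparator defined earlier in this file, which compares addresses in host byte order and deliberately avoids returning a plain difference, since the subtraction can overflow a signed int. A self-contained sketch of that comparator and of the "sort everything but the primary address, then scan for clashes" check, using documentation addresses, is shown below; it is illustrative only:

        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <arpa/inet.h>
        #include <stdio.h>
        #include <stdlib.h>

        static int
        cmp_v4(const void *a, const void *b)
        {
                in_addr_t ia = ntohl(((const struct in_addr *)a)->s_addr);
                in_addr_t ib = ntohl(((const struct in_addr *)b)->s_addr);

                /* Returning ia - ib would be wrong: the difference can overflow. */
                return (ia > ib) ? 1 : (ia < ib) ? -1 : 0;
        }

        int
        main(void)
        {
                struct in_addr ip[4];
                int i;

                inet_pton(AF_INET, "192.0.2.10", &ip[0]);   /* primary address, kept first */
                inet_pton(AF_INET, "198.51.100.7", &ip[1]);
                inet_pton(AF_INET, "192.0.2.20", &ip[2]);
                inet_pton(AF_INET, "198.51.100.7", &ip[3]); /* deliberate duplicate */

                /* Sort everything but ip[0], as kern_jail_set() does. */
                qsort(ip + 1, 3, sizeof(ip[0]), cmp_v4);

                for (i = 0; i + 1 < 4; i++)
                        if (ip[0].s_addr == ip[i + 1].s_addr ||
                            ip[i].s_addr == ip[i + 1].s_addr)
                                printf("duplicate address at index %d\n", i + 1);
                return (0);
        }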
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER | PR_IP6_DISABLE; if (ip6s == 0) pr_flags |= PR_IP6_USER | PR_IP6_DISABLE; else { pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER; ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? 
strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; goto done_free; } } } error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } /* * Grab the allprison lock before letting modules check their * parameters. Once we have it, do not let go so we'll have a * consistent view of the OSD list. */ sx_xlock(&allprison_lock); error = osd_jail_call(NULL, PR_METHOD_CHECK, opts); if (error) goto done_unlock_list; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_unlock_list; } } /* * See if we are creating a new record or updating an existing one. * This abuses the file error codes ENOENT and EEXIST. */ cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_unlock_list; } pr = NULL; namelc = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. 
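The name handling that follows splits a hierarchical jail name at its last dot, so "parent.child" is resolved against the parent prison and only the final component is matched relative to it. A minimal userspace sketch of that split (purely illustrative, outside the kernel):

        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                char name[] = "foo.bar.baz";    /* hypothetical hierarchical jail name */
                char *leaf, *dot;

                dot = strrchr(name, '.');
                if (dot == NULL) {
                        /* Flat name: the whole string is the leaf. */
                        leaf = name;
                        printf("parent: <caller's jail>  leaf: %s\n", leaf);
                } else {
                        *dot = '\0';            /* terminate the parent part in place */
                        leaf = dot + 1;
                        printf("parent: %s  leaf: %s\n", name, leaf);
                }
                return (0);
        }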
* This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ *namelc = '\0'; if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); } name = ++namelc; } if (name[0] != '\0') { namelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + namelen, name)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "parent jail went away!"); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. 
*/ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ if (name == NULL) name = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an erorr. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want other to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. 
*/ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP stop checking and return error. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (_prison_check_ip4(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (_prison_check_ip6(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (name != NULL) { /* Give a default name of the jid. 
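The IPv4 and IPv6 checks just above must confirm that a child's sorted, primary-first address list is a subset of its parent's, and they do so by walking both lists with two indices instead of an O(n*m) scan. A standalone sketch of that subset walk over plain host-order addresses (illustrative only; the kernel code additionally special-cases the primary address and the unsorted first slot):

        #include <stdint.h>
        #include <stdio.h>

        /* Both arrays must be sorted ascending; returns 1 if every child
         * entry also appears in the parent list. */
        static int
        is_subset(const uint32_t *child, int nchild, const uint32_t *parent, int nparent)
        {
                int ci, pi;

                for (ci = 0, pi = 0; ci < nchild; ci++) {
                        while (pi < nparent && parent[pi] < child[ci])
                                pi++;
                        if (pi == nparent || parent[pi] != child[ci])
                                return (0);
                }
                return (1);
        }

        int
        main(void)
        {
                uint32_t parent[] = { 10, 20, 30, 40 };
                uint32_t child_ok[] = { 20, 40 };
                uint32_t child_bad[] = { 20, 25 };

                printf("%d %d\n",
                    is_subset(child_ok, 2, parent, 4),    /* prints 1 */
                    is_subset(child_bad, 2, parent, 4));  /* prints 0 */
                return (0);
        }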
*/ if (name[0] == '\0') snprintf(name = numbuf, sizeof(numbuf), "%d", jid); else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid && *p == '\0')) { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ onamelen = strlen(pr->pr_name); namelen = strlen(name); if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (name != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, name, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, name); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. 
*/ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT - if (created) + if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (created) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { prison_deref(pr, PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. 
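The RACCT hunks in this revision all follow the same pattern: the resource-accounting hooks stay compiled in under options RACCT, but each call site is now guarded by the boot-time racct_enable switch, so the hooks become no-ops when accounting is left disabled (the kern.racct.enable tunable). A schematic userspace sketch of that guard pattern, with hypothetical names rather than the real kernel symbols:

        #include <stdio.h>
        #include <stdlib.h>

        /* Stand-in for racct_enable; the real switch comes from a loader tunable. */
        static int accounting_enable;

        static void
        accounting_attach(const char *name)
        {
                printf("accounting attached for %s\n", name);
        }

        static void
        container_create(const char *name)
        {
                /* Hook stays compiled in but is skipped unless enabled at boot. */
                if (accounting_enable)
                        accounting_attach(name);
                printf("created %s\n", name);
        }

        int
        main(void)
        {
                const char *tunable = getenv("ACCOUNTING_ENABLE"); /* stand-in for the tunable */

                accounting_enable = (tunable != NULL && atoi(tunable) != 0);
                container_create("demo");
                return (0);
        }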
*/ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT - if (!created) { + if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_errmsg; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); if (errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (errmsg_pos > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. 
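kern_jail_get() below resolves a prison by exactly one of lastjid, jid, or name; the lastjid form is what lets a tool enumerate jails by repeatedly asking for the first jid greater than the last one it saw. A hedged userspace sketch of that loop (error handling trimmed; output-buffer handling is my reading of the writeback path rather than a literal quote):

        #include <sys/param.h>
        #include <sys/jail.h>
        #include <sys/uio.h>
        #include <stdio.h>

        int
        main(void)
        {
                struct iovec iov[6];
                char name[MAXHOSTNAMELEN];
                int jid, lastjid = 0;

                for (;;) {
                        iov[0].iov_base = "lastjid";
                        iov[0].iov_len = sizeof("lastjid");
                        iov[1].iov_base = &lastjid;
                        iov[1].iov_len = sizeof(lastjid);
                        iov[2].iov_base = "jid";
                        iov[2].iov_len = sizeof("jid");
                        iov[3].iov_base = &jid;
                        iov[3].iov_len = sizeof(jid);
                        iov[4].iov_base = "name";
                        iov[4].iov_len = sizeof("name");
                        iov[5].iov_base = name;
                        iov[5].iov_len = sizeof(name);

                        if (jail_get(iov, 6, 0) == -1)
                                break;          /* ENOENT: no jail after lastjid */
                        printf("jid %d name %s\n", jid, name);
                        lastjid = jid;
                }
                return (0);
        }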
*/ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 
1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; pr->pr_flags |= PR_REMOVE; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; cpr->pr_flags |= PR_REMOVE; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. 
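prison_remove_one() here and prison_deref() further down work a two-level reference scheme: pr_ref keeps the structure itself valid, while pr_uref counts the "user" references (attached processes, the persist flag) that decide whether the jail is alive or merely dying. A much simplified sketch of that split, with hypothetical names and none of the locking or the cascade to the parent jail:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        struct obj {
                int ref;        /* structure references: memory stays valid */
                int uref;       /* user references: object is "alive" while > 0 */
                char *name;
        };

        static void
        obj_drop(struct obj *o, int deref, int deuref)
        {
                if (deuref)
                        o->uref--;
                if (deref)
                        o->ref--;
                if (o->uref == 0 && o->ref > 0)
                        printf("%s is dying (still referenced)\n", o->name);
                if (o->ref == 0) {
                        printf("%s destroyed\n", o->name);
                        free(o->name);
                        free(o);
                }
        }

        int
        main(void)
        {
                struct obj *o = malloc(sizeof(*o));

                o->ref = 2;             /* e.g. list reference plus persist reference */
                o->uref = 1;            /* e.g. the persist flag */
                o->name = strdup("demo");

                obj_drop(o, 1, 1);      /* drop persist: dying, but not freed */
                obj_drop(o, 1, 0);      /* drop the last structure ref: freed */
                return (0);
        }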
*/ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct prison *ppr; struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ ppr = td->td_ucred->cr_prison; p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = change_root(pr->pr_root, td))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; setsugid(p); crcopy(newcred, oldcred); newcred->cr_prison = pr; p->p_ucred = newcred; PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif crfree(oldcred); prison_deref(ppr, PD_DEREF | PD_DEUREF); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. 
*/ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has the specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); pr->pr_ref--; if (pr->pr_ref == 0) { mtx_unlock(&pr->pr_mtx); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } static void prison_complete(void *context, int pending) { prison_deref((struct prison *)context, 0); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { pr->pr_uref--; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } if (flags & PD_DEREF) pr->pr_ref--; /* If the prison still has references, nothing else to do. 
*/ if (pr->pr_ref > 0) { mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT - prison_racct_detach(pr); + if (racct_enable) + prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); prison_deref(pr, PD_DEUREF | PD_LOCKED); } #ifdef INET /* * Restrict a prison's IP address list with its parent's, possibly replacing * it. Return true if the replacement buffer was used (or would have been). */ static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP4_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip4s < ppr->pr_ip4s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip4 == NULL) { newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), M_PRISON, M_NOWAIT); if (newip4 != NULL) used = 0; } if (newip4 != NULL) { bcopy(ppr->pr_ip4, newip4, ppr->pr_ip4s * sizeof(*newip4)); free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = newip4; pr->pr_ip4s = ppr->pr_ip4s; } return (used); } pr->pr_ip4s = ppr->pr_ip4s; if (pr->pr_ip4s > 0) bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*newip4)); else if (pr->pr_ip4 != NULL) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } else if (pr->pr_ip4s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij < ppr->pr_ip4s) ii = 1; else { bcopy(pr->pr_ip4 + 1, pr->pr_ip4, --pr->pr_ip4s * sizeof(*pr->pr_ip4)); ii = 0; } for (ij = 1; ii < pr->pr_ip4s; ) { if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { ii++; continue; } switch (ij >= ppr->pr_ip4s ? 
-1 : qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { case -1: bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip4s == 0) { pr->pr_flags |= PR_IP4_DISABLE; free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } return (0); } /* * Pass back primary IPv4 address of this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. */ int prison_get_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv4 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv4. */ int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr lia; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP4_SADDRSEL) return (1); lia.s_addr = INADDR_ANY; error = prison_get_ip4(cred, &lia); if (error) return (error); if (lia.s_addr == INADDR_ANY) return (1); ia->s_addr = lia.s_addr; return (0); } /* * Return true if pr1 and pr2 have the same IPv4 address restrictions. */ int prison_equal_ip4(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); /* * No need to lock since the PR_IP4_USER flag can't be altered for * existing prisons. */ while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP4_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP4_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this * jail. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO and returned in NBO. */ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr ia0; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia0.s_addr = ntohl(ia->s_addr); if (ia0.s_addr == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } if (ia0.s_addr == INADDR_ANY) { /* * In case there is only 1 IPv4 address, bind directly. 
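 * For example, a jail configured with the single address 192.0.2.10 (an
 * illustrative value, not one taken from this change) that binds a socket
 * to INADDR_ANY is rebound to that one address by the assignment below.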
*/ if (pr->pr_ip4s == 1) ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address passed in in NBO and returned in NBO. */ int prison_remote_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (ntohl(ia->s_addr) == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO. */ static int _prison_check_ip4(struct prison *pr, struct in_addr *ia) { int i, a, z, d; /* * Check the primary IP. */ if (pr->pr_ip4[0].s_addr == ia->s_addr) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip4s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v4(&pr->pr_ip4[i+1], ia); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } #endif #ifdef INET6 static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP6_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip6s < ppr->pr_ip6s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip6 == NULL) { newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), M_PRISON, M_NOWAIT); if (newip6 != NULL) used = 0; } if (newip6 != NULL) { bcopy(ppr->pr_ip6, newip6, ppr->pr_ip6s * sizeof(*newip6)); free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = newip6; pr->pr_ip6s = ppr->pr_ip6s; } return (used); } pr->pr_ip6s = ppr->pr_ip6s; if (pr->pr_ip6s > 0) bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*newip6)); else if (pr->pr_ip6 != NULL) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } else if (pr->pr_ip6s > 0) { /* Remove addresses that aren't in the parent. 
*/ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], &ppr->pr_ip6[ij])) break; if (ij < ppr->pr_ip6s) ii = 1; else { bcopy(pr->pr_ip6 + 1, pr->pr_ip6, --pr->pr_ip6s * sizeof(*pr->pr_ip6)); ii = 0; } for (ij = 1; ii < pr->pr_ip6s; ) { if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], &ppr->pr_ip6[0])) { ii++; continue; } switch (ij >= ppr->pr_ip6s ? -1 : qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { case -1: bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip6s == 0) { pr->pr_flags |= PR_IP6_DISABLE; free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } return 0; } /* * Pass back primary IPv6 address for this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv6 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv6. */ int prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; struct in6_addr lia6; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP6_SADDRSEL) return (1); lia6 = in6addr_any; error = prison_get_ip6(cred, &lia6); if (error) return (error); if (IN6_IS_ADDR_UNSPECIFIED(&lia6)) return (1); bcopy(&lia6, ia6, sizeof(struct in6_addr)); return (0); } /* * Return true if pr1 and pr2 have the same IPv6 address restrictions. */ int prison_equal_ip6(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP6_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP6_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. 
*/ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } if (IN6_IS_ADDR_UNSPECIFIED(ia6)) { /* * In case there is only 1 IPv6 address, and v6only is true, * then bind directly. */ if (v6only != 0 && pr->pr_ip6s == 1) bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. */ static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6) { int i, a, z, d; /* * Check the primary IP. */ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6)) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip6s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v6(&pr->pr_ip6[i+1], ia6); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } #endif /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. 
*/ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? 
cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is ugly check, but this is the only situation when jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have situation where jail's root directory * is "/some/path" and mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check with permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. 
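 * For example, PRIV_NETINET_RAW is granted below only when the jail was
 * created with allow.raw_sockets set, which is why ping(8) in a default
 * jail fails with EPERM.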
*/ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME : case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * AppleTalk privileges. */ case PRIV_NETATALK_RESERVEDPORT: /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * IPX/SPX privileges. */ case PRIV_NETIPX_RESERVEDPORT: case PRIV_NETIPX_RAW: /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. 
*/ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports and reuse in-use * ports. */ case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on.
But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? 
PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family"); #endif /* * Default parameters for jail(2) compatability. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. 
*/ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? 
*(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); 
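/*
 * Userland discovers each jail parameter's type and maximum size through
 * the security.jail.param.* nodes declared here before calling jail_set(2).
 * A minimal probe of one integer parameter (illustrative only; it assumes
 * nothing beyond the standard sysctlbyname(3) interface) might be:
 *
 *	size_t len = 0;
 *	if (sysctlbyname("security.jail.param.enforce_statfs",
 *	    NULL, &len, NULL, 0) == 0)
 *		printf("%zu-byte payload\n", len);
 *
 * String parameters instead report their maximum length within the string
 * itself, as produced by sysctl_jail_param() above.
 */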
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); +#ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); + sx_slock(&allprison_lock); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. 
*/ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); + sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { + ASSERT_RACCT_ENABLED(); + refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } -#ifdef RACCT static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; + ASSERT_RACCT_ENABLED(); + sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { + ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa(pr->pr_ip4[ii])); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: stable/10/sys/kern/kern_racct.c =================================================================== --- stable/10/sys/kern/kern_racct.c (revision 284664) +++ stable/10/sys/kern/kern_racct.c (revision 284665) @@ -1,1291 +1,1387 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_kdtrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; +#ifdef RACCT_DEFAULT_TO_DISABLED +int racct_enable = 0; +#else +int racct_enable = 1; +#endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); +SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, + 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. 
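 * For example, a process that has accumulated 0.5 s of CPU time over its
 * first 2 s of wall-clock time is charged 25% during that initial window.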
*/ #define RACCT_PCPU_SECS 3 static struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. 
*/ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, 
[89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogical to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values showed by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; + ASSERT_RACCT_ENABLED(); + /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, the sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. 
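The ccpu_exp[] table above is the 4BSD per-second %cpu decay factor raised to successive powers, stored in FSCALE fixed point: entry i is FSCALE * e^(-i/20), so entry 20 is FSCALE/e and CCPU_EXP_MAX covers 110 seconds of residency time. racct_getpcpu() then divides by (FSCALE - ccpu_exp[swtime]) so that freshly swapped-in processes are scaled the same way ps(1) scales them. A small userland sketch (not part of this commit) that regenerates the table entries:

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		int i;

		/* Entry i of ccpu_exp[] is FSCALE times this factor. */
		for (i = 0; i <= 110; i++)
			printf("[%d] = FSCALE * %.20f,\n", i, exp(-i / 20.0));
		return (0);
	}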
*/ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) { KASSERT(RACCT_IS_SLOPPY(i) || RACCT_IS_DECAYING(i), ("%s: resource %d usage < 0", __func__, i)); dest->r_resources[i] = 0; } } } } void racct_create(struct racct **racctp) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { int i; struct racct *racct; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_destroy_locked(racct); mtx_unlock(&racct_lock); } /* * Increase consumption of 'resource' by 'amount' for 'racct' * and all its parents. Differently from other cases, 'amount' here * may be less than zero. */ static void racct_alloc_resource(struct racct *racct, int resource, uint64_t amount) { + ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100%. * For example in racct_proc_exit() we add the process %cpu usage * to the ucred racct containers. If too many processes terminated * in a short time span, the ucred %cpu resource could grow too much. * Also, the 4BSD scheduler sometimes returns for a thread more than * 100% cpu usage. So we set a boundary here to 100%. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount) { #ifdef RCTL int error; #endif + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, add__failure, p, resource, amount, 0, 0); return (error); } #endif racct_alloc_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. 
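racct_add_locked() above is the point where a deniable resource can actually be refused: rctl_enforce() runs before any accounting, and a matching deny rule causes the charge to be rejected. A schematic consumer, not taken from this commit (the helper and the errno translation are assumptions), charges the resource first and rolls the charge back if the real allocation fails, holding the proc lock as the racct_add()/racct_sub() wrappers require:

	/* Sketch only; assumes <sys/param.h>, <sys/proc.h>, <sys/racct.h>. */
	static int	example_do_real_allocation(void);	/* hypothetical */

	static int
	example_charge_descriptor(struct proc *p)
	{
		int error;

		PROC_LOCK(p);
		error = racct_add(p, RACCT_NOFILE, 1);
		PROC_UNLOCK(p);
		if (error != 0)
			return (EMFILE);	/* translate the generic denial */
		if (example_do_real_allocation() != 0) {
			/* Undo the accounting if the real allocation fails. */
			PROC_LOCK(p);
			racct_sub(p, RACCT_NOFILE, 1);
			PROC_UNLOCK(p);
			return (ENOMEM);
		}
		return (0);
	}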
*/ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; + if (!racct_enable) + return (0); + mtx_lock(&racct_lock); error = racct_add_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount, 0, 0); racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. * * XXX: Shouldn't this ever return an error? */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_add_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); mtx_lock(&racct_lock); racct_alloc_resource(p->p_racct, resource, amount); mtx_unlock(&racct_lock); racct_add_cred(p->p_ucred, resource, amount); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; #ifdef RCTL int error; #endif + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE(racct, kernel, rusage, set__failure, p, resource, amount, 0, 0); return (error); } } #endif racct_alloc_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. 
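racct_set_locked() above works on absolute amounts: it computes the delta against what is currently recorded for the process and propagates only that delta to the per-credential containers, scaling the old amount by RACCT_DECAY_FACTOR first for decaying resources such as %cpu. A caller that tracks a running total therefore re-publishes the whole figure rather than increments; a minimal sketch, with assumed headers and a hypothetical function name:

	/* Sketch only; assumes <sys/param.h>, <sys/proc.h>, <sys/racct.h>. */
	static int
	example_publish_vmem(struct proc *p, uint64_t total_bytes)
	{
		int error;

		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, total_bytes);
		PROC_UNLOCK(p);

		/* Decreasing the amount always succeeds; growth may be denied. */
		return (error);
	}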
*/ int racct_set(struct proc *p, int resource, uint64_t amount) { int error; + if (!racct_enable) + return (0); + mtx_lock(&racct_lock); error = racct_set_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_set_force_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; racct_alloc_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { + + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_set_force_locked(p, resource, amount); mtx_unlock(&racct_lock); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { + if (!racct_enable) + return (UINT64_MAX); + #ifdef RCTL return (rctl_get_limit(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { + if (!racct_enable) + return (UINT64_MAX); + #ifdef RCTL return (rctl_get_available(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { + ASSERT_RACCT_ENABLED(); + #ifdef RCTL return (rctl_pcpu_available(p)); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { + if (!racct_enable) + return; + SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); /* * We need proc lock to dereference p->p_ucred. 
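racct_get_limit() and racct_get_available() above both fall back to UINT64_MAX when RCTL is not compiled in or accounting is disabled, so callers may use them unconditionally to size requests. A minimal, hypothetical headroom check (not from this commit):

	/* Sketch only; locking of 'p' elided. */
	static uint64_t
	example_clamp_to_headroom(struct proc *p, uint64_t request)
	{
		uint64_t avail;

		avail = racct_get_available(p, RACCT_SWAP);
		return (request < avail ? request : avail);
	}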
*/ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); mtx_lock(&racct_lock); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_alloc_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); mtx_unlock(&racct_lock); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; + ASSERT_RACCT_ENABLED(); + SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount, 0, 0); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { + if (!racct_enable) + return; + mtx_lock(&racct_lock); racct_sub_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; + if (!racct_enable) + return (0); + /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); mtx_lock(&racct_lock); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i]); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1); error += racct_add_locked(child, RACCT_NTHR, 1); out: mtx_unlock(&racct_lock); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { #ifdef RCTL + if (!racct_enable) + return; + PROC_LOCK(child); mtx_lock(&racct_lock); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); mtx_unlock(&racct_lock); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { int i; uint64_t runtime; struct timeval wallclock; uint64_t pct_estimate, pct; + if (!racct_enable) + return; + PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. 
*/ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_locked(p, RACCT_CPU, runtime); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0); } mtx_unlock(&racct_lock); PROC_UNLOCK(p); #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy(&p->p_racct); } /* * Called after credentials change, to move resource utilisation * between raccts. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; + if (!racct_enable) + return; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; mtx_lock(&racct_lock); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } mtx_unlock(&racct_lock); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); #endif } void racct_move(struct racct *dest, struct racct *src) { + ASSERT_RACCT_ENABLED(); + mtx_lock(&racct_lock); racct_add_racct(dest, src); racct_sub_racct(src, src); mtx_unlock(&racct_lock); } static void racct_proc_throttle(struct proc *p) { struct thread *td; #ifdef SMP int cpuid; #endif + ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) return; p->p_throttled = 1; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDF_NEEDRESCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. 
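racct_proc_exit() above and the racctd loop further down compute pct_estimate identically: CPU time and wall-clock time are both expressed in microseconds, and the ratio is scaled into the millionths-of-a-percent units used for RACCT_PCTCPU. A standalone sketch of the arithmetic with made-up inputs:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t runtime = 2500000;	/* 2.5 s of CPU time, in us */
		uint64_t wallclock = 10000000;	/* 10 s since start, in us */
		uint64_t pct_estimate;

		/* Same formula as racct_proc_exit() and racctd(). */
		pct_estimate = (1000000 * runtime * 100) / wallclock;
		printf("%ju\n", (uintmax_t)pct_estimate);	/* 25000000, i.e. 25% */
		return (0);
	}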
*/ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { + + ASSERT_RACCT_ENABLED(); + PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_resource(struct racct *racct, void * res, void* dummy) { int resource; int64_t r_old, r_new; + ASSERT_RACCT_ENABLED(); + resource = *(int *)res; r_old = racct->r_resources[resource]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; mtx_lock(&racct_lock); r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[resource] = r_new; mtx_unlock(&racct_lock); } static void racct_decay(int resource) { + + ASSERT_RACCT_ENABLED(); + ui_racct_foreach(racct_decay_resource, &resource, NULL); loginclass_racct_foreach(racct_decay_resource, &resource, NULL); prison_racct_foreach(racct_decay_resource, &resource, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t runtime; uint64_t pct, pct_estimate; + ASSERT_RACCT_ENABLED(); + for (;;) { racct_decay(RACCT_PCTCPU); sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); } FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_SUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_force_locked(p, RACCT_PCTCPU, pct); racct_set_locked(p, RACCT_CPU, runtime); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); mtx_unlock(&racct_lock); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) racct_proc_throttle(p); else if (p->p_throttled) racct_proc_wakeup(p); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; -SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); static void +racctd_init(void) +{ + if (!racct_enable) + return; + + kproc_start(&racctd_kp); +} +SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); + +static void racct_init(void) { + if (!racct_enable) + return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * XXX: Move this somewhere. 
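The SYSINIT change above is what makes the whole framework optional at boot: racctd and the racct UMA zone are only set up when racct_enable is true, and with options RACCT_DEFAULT_TO_DISABLED the framework stays off unless kern.racct.enable=1 is set from loader.conf. A minimal sketch of how such a read-only boot tunable is commonly declared (illustrative names, not the commit's exact declaration):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/sysctl.h>

	/* Off by default; flip it from loader.conf with kern.example_enable=1. */
	static int example_enable = 0;
	TUNABLE_INT("kern.example_enable", &example_enable);
	SYSCTL_INT(_kern, OID_AUTO, example_enable, CTLFLAG_RDTUN,
	    &example_enable, 0, "Enable the example framework");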
*/ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #else /* !RACCT */ int racct_add(struct proc *p, int resource, uint64_t amount) { return (0); } void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { } void racct_add_force(struct proc *p, int resource, uint64_t amount) { return; } int racct_set(struct proc *p, int resource, uint64_t amount) { return (0); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { } void racct_sub(struct proc *p, int resource, uint64_t amount) { } void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { } uint64_t racct_get_limit(struct proc *p, int resource) { return (UINT64_MAX); } uint64_t racct_get_available(struct proc *p, int resource) { return (UINT64_MAX); } void racct_create(struct racct **racctp) { } void racct_destroy(struct racct **racctp) { } int racct_proc_fork(struct proc *parent, struct proc *child) { return (0); } void racct_proc_fork_done(struct proc *child) { } void racct_proc_exit(struct proc *p) { } #endif /* !RACCT */ Index: stable/10/sys/kern/kern_rctl.c =================================================================== --- stable/10/sys/kern/kern_rctl.c (revision 284664) +++ stable/10/sys/kern/kern_rctl.c (revision 284665) @@ -1,1870 +1,1939 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 /* Default buffer size for rctl_get_rules(2). */ #define RCTL_DEFAULT_BUFSIZE 4096 #define RCTL_MAX_INBUFLEN 4096 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) /* * 'rctl_rule_link' connects a rule with every racct it's related to. 
* For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_link_zone; static uma_zone_t rctl_rule_zone; static struct rwlock rctl_lock; RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock"); static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } 
panic("rctl_resource_name: unknown resource %d", resource); } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { int resource; int64_t available = INT64_MAX; struct ucred *cred = p->p_ucred; + ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); resource = rule->rr_resource; switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: available = rule->rr_amount - p->p_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_USER: available = rule->rr_amount - cred->cr_ruidinfo->ui_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_LOGINCLASS: available = rule->rr_amount - cred->cr_loginclass->lc_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_JAIL: available = rule->rr_amount - cred->cr_prison->pr_prison_racct->prr_racct-> r_resources[resource]; break; default: panic("rctl_compute_available: unknown per %d", rule->rr_per); } return (available); } /* * Return non-zero if allocating 'amount' by proc 'p' would exceed * resource limit specified by 'rule'. */ static int rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule, int64_t amount) { int64_t available; + ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); available = rctl_available_resource(p, rule); if (available >= amount) return (0); return (1); } /* * Special version of rctl_available() function for the %cpu resource. * We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; + ASSERT_RACCT_ENABLED(); + minavailable = INT64_MAX; limit = 0; rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } rw_runlock(&rctl_lock); /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more agressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; int should_deny = 0; char *buf; static int curtime = 0; static struct timeval lasttime; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (!rctl_would_exceed(p, rule, amount)) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. 
*/ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). */ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&lasttime, &curtime, 10)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } rw_runlock(&rctl_lock); if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } rw_runlock(&rctl_lock); return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } rw_runlock(&rctl_lock); /* * XXX: Think about this _hard_. 
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { + ASSERT_RACCT_ENABLED(); + if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; rw_wlock(&rctl_lock); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); rw_wunlock(&rctl_lock); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rw_assert(&rctl_lock, RA_WLOCKED); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. 
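rctl_rule_matches() above treats every undefined field of the filter as a wildcard, which is what allows a partial rule string to act as a removal filter. A hedged userland sketch, assuming the four-argument wrapper documented in rctl_remove_rule(2) and root privileges, that drops every rule attached to uid 1001:

	#include <sys/types.h>
	#include <sys/rctl.h>
	#include <err.h>
	#include <string.h>

	int
	main(void)
	{
		const char *filter = "user:1001";	/* remaining fields wildcarded */

		if (rctl_remove_rule(filter, strlen(filter) + 1, NULL, 0) != 0)
			err(1, "rctl_remove_rule");	/* ESRCH if nothing matched */
		return (0);
	}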
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { int removed = 0; struct rctl_rule_link *link, *linktmp; + ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_WLOCKED); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; + ASSERT_RACCT_ENABLED(); + rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; + ASSERT_RACCT_ENABLED(); + copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to 
be inaccessible. */ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { + ASSERT_RACCT_ENABLED(); + switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { int error = 0; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; struct rctl_rule *rule; id_t id; + ASSERT_RACCT_ENABLED(); + rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) 
goto out; } if (actionstr == NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) rule->rr_amount *= 1000000; } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; + ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense. Note that the one below * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU, * for example, is not deniable in the racct sense, but the * limit is enforced in a different way, so "deny" rules for %CPU * do make sense. */ if (rule->rr_action == RCTL_ACTION_DENY && (rule->rr_resource == RACCT_CPU || rule->rr_resource == RACCT_WALLCLOCK)) return (EOPNOTSUPP); if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) return (EOPNOTSUPP); /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
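Rules reach rctl_rule_add() above in the textual form parsed by rctl_string_to_rule(): subject:subject-id:resource:action=amount/per. A hedged userland sketch, again assuming the four-argument rctl_add_rule(2) wrapper and root privileges, that installs a per-user open-files limit:

	#include <sys/types.h>
	#include <sys/rctl.h>
	#include <err.h>
	#include <string.h>

	int
	main(void)
	{
		const char *rule = "user:1001:openfiles:deny=4096/user";

		if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
			err(1, "rctl_add_rule");
		return (0);
	}

The equivalent rctl(8) invocation would be rctl -a user:1001:openfiles:deny=4096/user.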
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; + ASSERT_RACCT_ENABLED(); + rw_wlock(&rctl_lock); found += rctl_racct_remove_rules(racct, filter); rw_wunlock(&rctl_lock); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { int found = 0; struct proc *p; + ASSERT_RACCT_ENABLED(); + if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; rw_wlock(&rctl_lock); found = rctl_racct_remove_rules(p->p_racct, filter); rw_wunlock(&rctl_lock); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); rw_wlock(&rctl_lock); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } rw_wunlock(&rctl_lock); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; + ASSERT_RACCT_ENABLED(); + sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. 
*/ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { int error; char *str; + ASSERT_RACCT_ENABLED(); + if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFLEN) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; + ASSERT_RACCT_ENABLED(); + if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { int i; int64_t amount; struct sbuf *sb; + ASSERT_RACCT_ENABLED(); + sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; amount = racct->r_resources[i]; if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { int error; char *inputstr; struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; + ASSERT_RACCT_ENABLED(); + rw_rlock(&rctl_lock); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } rw_runlock(&rctl_lock); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { int error; size_t bufsize = RCTL_DEFAULT_BUFSIZE; char *inputstr, *buf; struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; + if 
(!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } again: buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } rw_runlock(&rctl_lock); } loginclass_racct_foreach(rctl_get_rules_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { sbuf_delete(sb); free(buf, M_RCTL); bufsize *= 4; goto again; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { int error; size_t bufsize = RCTL_DEFAULT_BUFSIZE; char *inputstr, *buf; struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } again: buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); rw_rlock(&rctl_lock); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } rw_runlock(&rctl_lock); if (sbuf_error(sb) == ENOMEM) { sbuf_delete(sb); free(buf, M_RCTL); bufsize *= 4; goto again; } /* * Remove trailing ",". 
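The syscall handlers above now return ENOSYS when accounting is disabled, so userland can detect availability simply by making a call. A hedged sketch, assuming the rctl_get_racct(2) wrapper signature and sufficient privilege, that fetches the comma-separated resource=amount string produced by rctl_racct_to_sbuf() for the current process:

	#include <sys/types.h>
	#include <sys/rctl.h>
	#include <err.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		char filter[32], out[4096];

		snprintf(filter, sizeof(filter), "process:%d", (int)getpid());
		if (rctl_get_racct(filter, strlen(filter) + 1, out,
		    sizeof(out)) != 0)
			err(1, "rctl_get_racct");	/* ENOSYS if racct is off */
		printf("%s\n", out);
		return (0);
	}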
*/ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { int error; struct rctl_rule *rule; char *inputstr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { int error; struct rctl_rule *filter; char *inputstr; + if (!racct_enable) + return (ENOSYS); + error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { int rulecnt, i; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; LIST_HEAD(, rctl_rule_link) newrules; + ASSERT_RACCT_ENABLED(); + newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; rw_runlock(&rctl_lock); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. 
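For context, the strings parsed by rctl_string_to_rule() above follow the rctl(8) syntax subject:subject-id:resource:action=amount/per, and a filter is simply a rule with some components left out. Below is a hedged userland sketch of driving the two syscalls wrapped above; it assumes the rctl_add_rule(2) and rctl_remove_rule(2) prototypes are available from <sys/rctl.h>, and the output buffer arguments are unused by these particular calls.

#include <sys/types.h>
#include <sys/rctl.h>		/* assumed home of the userland prototypes */
#include <err.h>
#include <string.h>

int
main(void)
{
	/* Deny PID 1234 more than 1 GB of virtual memory. */
	const char *rule = "process:1234:vmemoryuse:deny=1073741824";
	/* A partial rule acts as a filter matching every rule for that PID. */
	const char *filter = "process:1234:";

	if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
		err(1, "rctl_add_rule");
	if (rctl_remove_rule(filter, strlen(filter) + 1, NULL, 0) != 0)
		err(1, "rctl_remove_rule");
	return (0);
}

With options RCTL compiled in but kern.racct.enable left at 0, both calls now fail with ENOSYS, which is exactly what the racct_enable checks added above arrange.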
*/ rw_wlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } rw_wunlock(&rctl_lock); return; } goaround: rw_wunlock(&rctl_lock); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { int error; struct rctl_rule_link *link; struct rctl_rule *rule; LIST_INIT(&child->p_racct->r_rule_links); + ASSERT_RACCT_ENABLED(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); rw_wlock(&rctl_lock); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } rw_wunlock(&rctl_lock); return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } rw_wunlock(&rctl_lock); return (EAGAIN); } /* * Release rules attached to the racct. 
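rctl_proc_ucred_changed() above cannot allocate rule links while holding rctl_lock, since it wants M_WAITOK, so it counts the matching rules under the lock, drops the lock to allocate that many entries, then re-takes the lock and starts over if the rule set changed in the meantime. A stripped-down, self-contained sketch of that optimistic-preallocation pattern, with hypothetical item and list names and allocation-failure handling omitted:

#include <pthread.h>
#include <stdlib.h>

struct item {
	struct item	*next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nitems;			/* protected by lock */

static struct item *
prealloc_snapshot(void)
{
	struct item *head, *it;
	int count, i;

again:
	/* 1. Count while locked. */
	pthread_mutex_lock(&lock);
	count = nitems;
	pthread_mutex_unlock(&lock);

	/* 2. Allocate with the lock dropped; this step may sleep. */
	head = NULL;
	for (i = 0; i < count; i++) {
		it = malloc(sizeof(*it));
		it->next = head;
		head = it;
	}

	/* 3. Re-take the lock; if the count moved, undo and retry. */
	pthread_mutex_lock(&lock);
	if (count != nitems) {
		pthread_mutex_unlock(&lock);
		while (head != NULL) {
			it = head;
			head = head->next;
			free(it);
		}
		goto again;
	}
	/* ... fill in the preallocated entries here, still locked ... */
	pthread_mutex_unlock(&lock);
	return (head);
}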
*/ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; + ASSERT_RACCT_ENABLED(); + rw_wlock(&rctl_lock); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } rw_wunlock(&rctl_lock); } static void rctl_init(void) { + + if (!racct_enable) + return; rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ Index: stable/10/sys/kern/kern_thr.c =================================================================== --- stable/10/sys/kern/kern_thr.c (revision 284664) +++ stable/10/sys/kern/kern_thr.c (revision 284665) @@ -1,559 +1,561 @@ /*- * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
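The #else branch above is the old compile-time gate: without options RCTL the syscalls are plain ENOSYS stubs. This change layers a run-time gate on top of it, so that a kernel built with RCTL but running with kern.racct.enable=0 behaves the same way, while internal helpers simply assert that accounting is on. A self-contained sketch of the pairing; ASSERT_RACCT_ENABLED() is presumably a KASSERT wrapper defined in racct.h (its definition is not part of this diff), and assert(3) stands in for KASSERT here.

#include <assert.h>
#include <errno.h>

static int racct_enable;	/* stands in for the kern.racct.enable tunable */

/* Presumed shape of the macro used throughout the file. */
#define	ASSERT_RACCT_ENABLED()	assert(racct_enable)

static int
example_rctl_syscall(void)
{
	if (!racct_enable)	/* run-time gate: RCTL present, racct switched off */
		return (ENOSYS);
	ASSERT_RACCT_ENABLED();	/* code below the gate may simply assert */
	/* ... real work ... */
	return (0);
}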
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count"); #ifdef COMPAT_FREEBSD32 static inline int suword_lwpid(void *addr, lwpid_t lwpid) { int error; if (SV_CURPROC_FLAG(SV_LP64)) error = suword(addr, lwpid); else error = suword32(addr, lwpid); return (error); } #else #define suword_lwpid suword #endif static int create_thread(struct thread *td, mcontext_t *ctx, void (*start_func)(void *), void *arg, char *stack_base, size_t stack_size, char *tls_base, long *child_tid, long *parent_tid, int flags, struct rtprio *rtp); /* * System call interface. */ int sys_thr_create(struct thread *td, struct thr_create_args *uap) /* ucontext_t *ctx, long *id, int flags */ { ucontext_t ctx; int error; if ((error = copyin(uap->ctx, &ctx, sizeof(ctx)))) return (error); error = create_thread(td, &ctx.uc_mcontext, NULL, NULL, NULL, 0, NULL, uap->id, NULL, uap->flags, NULL); return (error); } int sys_thr_new(struct thread *td, struct thr_new_args *uap) /* struct thr_param * */ { struct thr_param param; int error; if (uap->param_size < 0 || uap->param_size > sizeof(param)) return (EINVAL); bzero(¶m, sizeof(param)); if ((error = copyin(uap->param, ¶m, uap->param_size))) return (error); return (kern_thr_new(td, ¶m)); } int kern_thr_new(struct thread *td, struct thr_param *param) { struct rtprio rtp, *rtpp; int error; rtpp = NULL; if (param->rtp != 0) { error = copyin(param->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); rtpp = &rtp; } error = create_thread(td, NULL, param->start_func, param->arg, param->stack_base, param->stack_size, param->tls_base, param->child_tid, param->parent_tid, param->flags, rtpp); return (error); } static int create_thread(struct thread *td, mcontext_t *ctx, void (*start_func)(void *), void *arg, char *stack_base, size_t stack_size, char *tls_base, long *child_tid, long *parent_tid, int flags, struct rtprio *rtp) { stack_t stack; struct thread *newtd; struct proc *p; int error; p = td->td_proc; /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) { ++max_threads_hits; return (EPROCLIM); } if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(td->td_proc); if (error != 0) return (EPROCLIM); #endif /* Initialize our td */ newtd = thread_alloc(0); if (newtd == NULL) { error = ENOMEM; goto fail; } cpu_set_upcall(newtd, td); /* * Try the copyout as soon as we allocate the td so we don't * have to tear things down in a failure case below. 
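sys_thr_new() above takes a caller-supplied size for struct thr_param so that binaries built against an older, shorter structure keep working: the kernel rejects sizes larger than its own structure, zeroes the local copy, and copies in only what the caller passed, leaving newer fields at zero. The same forward-compatibility idiom in isolation, with a hypothetical structure and a memcpy() stand-in for copyin():

#include <errno.h>
#include <stddef.h>
#include <string.h>

struct new_params {
	long	first;		/* fields present from the start */
	long	second;
	long	added_later;	/* newer field older callers never pass */
};

static int
example_syscall(const void *uparam, size_t size)
{
	struct new_params p;

	if (size > sizeof(p))		/* userland is newer than the kernel */
		return (EINVAL);
	memset(&p, 0, sizeof(p));	/* unspecified fields read as zero */
	memcpy(&p, uparam, size);	/* the kernel uses copyin() here */
	/* ... use p; p.added_later is 0 for old callers ... */
	return (0);
}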
* Here we copy out tid to two places, one for child and one * for parent, because pthread can create a detached thread, * if parent wants to safely access child tid, it has to provide * its storage, because child thread may exit quickly and * memory is freed before parent thread can access it. */ if ((child_tid != NULL && suword_lwpid(child_tid, newtd->td_tid)) || (parent_tid != NULL && suword_lwpid(parent_tid, newtd->td_tid))) { thread_free(newtd); error = EFAULT; goto fail; } bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); newtd->td_su = NULL; bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; newtd->td_ucred = crhold(td->td_ucred); if (ctx != NULL) { /* old way to set user context */ error = set_mcontext(newtd, ctx); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); goto fail; } } else { /* Set up our machine context. */ stack.ss_sp = stack_base; stack.ss_size = stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall_kse(newtd, start_func, arg, &stack); /* Setup user TLS address and TLS pointer register. */ error = cpu_set_user_tls(newtd, tls_base); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); goto fail; } } PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT - PROC_LOCK(p); - racct_sub(p, RACCT_NTHR, 1); - PROC_UNLOCK(p); + if (racct_enable) { + PROC_LOCK(p); + racct_sub(p, RACCT_NTHR, 1); + PROC_UNLOCK(p); + } #endif return (error); } int sys_thr_self(struct thread *td, struct thr_self_args *uap) /* long *id */ { int error; error = suword_lwpid(uap->id, (unsigned)td->td_tid); if (error == -1) return (EFAULT); return (0); } int sys_thr_exit(struct thread *td, struct thr_exit_args *uap) /* long *state */ { struct proc *p; p = td->td_proc; /* Signal userland that it can free the stack. */ if ((void *)uap->state != NULL) { suword_lwpid(uap->state, 1); kern_umtx_wake(td, uap->state, INT_MAX, 0); } rw_wlock(&tidhash_lock); PROC_LOCK(p); if (p->p_numthreads != 1) { racct_sub(p, RACCT_NTHR, 1); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); tdsigcleanup(td); umtx_thread_exit(td); PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ } /* * Ignore attempts to shut down last thread in the proc. This * will actually call _exit(2) in the usermode trampoline when * it returns. 
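The thread lifecycle above is also where the per-process RACCT_NTHR counter is kept: create_thread() charges one thread up front and refuses creation if racct_add() fails (which is how an rctl deny rule on nthr takes effect), the fail: path refunds the charge, now only when racct_enable is set, and sys_thr_exit() subtracts it again when a non-last thread exits. A condensed, schematic restatement of that charge-and-refund discipline, not a literal extract; actually_create_the_thread() is hypothetical, and racct_add() is assumed to return 0 when accounting is disabled, which is why only the refund needs the explicit racct_enable test in the diff.

static int
example_create_thread(struct proc *p)
{
	int error;

	PROC_LOCK(p);
	error = racct_add(p, RACCT_NTHR, 1);	/* presumably a no-op when racct is off */
	PROC_UNLOCK(p);
	if (error != 0)
		return (EPROCLIM);		/* an rctl deny rule on nthr fired */

	if (actually_create_the_thread(p) != 0) {
		/* Creation failed: hand the charge back, but only if accounting is on. */
		if (racct_enable) {
			PROC_LOCK(p);
			racct_sub(p, RACCT_NTHR, 1);
			PROC_UNLOCK(p);
		}
		return (ENOMEM);
	}
	return (0);
}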
*/ PROC_UNLOCK(p); rw_wunlock(&tidhash_lock); return (0); } int sys_thr_kill(struct thread *td, struct thr_kill_args *uap) /* long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; p = td->td_proc; ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = p->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } PROC_UNLOCK(p); } } else { error = 0; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(ttd->td_proc); } return (error); } int sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap) /* pid_t pid, long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->sig); ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if ((p = pfind(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (error) { PROC_UNLOCK(p); return (error); } if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } } PROC_UNLOCK(p); } else { ttd = tdfind((lwpid_t)uap->id, uap->pid); if (ttd == NULL) return (ESRCH); p = ttd->td_proc; AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(p); } return (error); } int sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap) /* const struct timespec *timeout */ { struct timespec ts, *tsp; int error; tsp = NULL; if (uap->timeout != NULL) { error = umtx_copyin_timeout(uap->timeout, &ts); if (error != 0) return (error); tsp = &ts; } return (kern_thr_suspend(td, tsp)); } int kern_thr_suspend(struct thread *td, struct timespec *tsp) { struct proc *p = td->td_proc; struct timeval tv; int error = 0; int timo = 0; if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } if (tsp != NULL) { if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) error = EWOULDBLOCK; else { TIMESPEC_TO_TIMEVAL(&tv, tsp); timo = tvtohz(&tv); } } PROC_LOCK(p); if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0) error = msleep((void *)td, &p->p_mtx, PCATCH, "lthr", timo); if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { if (timo != 0) error = EINTR; } return (error); } int sys_thr_wake(struct thread *td, struct thr_wake_args *uap) /* long id */ { struct proc *p; struct thread *ttd; if (uap->id == td->td_tid) { td->td_pflags |= TDP_WAKEUP; return (0); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); } int sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap) { struct proc *p; char name[MAXCOMLEN + 1]; struct thread *ttd; int error; error = 0; 
name[0] = '\0'; if (uap->name != NULL) { error = copyinstr(uap->name, name, sizeof(name), NULL); if (error) return (error); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); strcpy(ttd->td_name, name); #ifdef KTR sched_clear_tdname(ttd); #endif PROC_UNLOCK(p); return (error); } Index: stable/10/sys/kern/sched_4bsd.c =================================================================== --- stable/10/sys/kern/sched_4bsd.c (revision 284664) +++ stable/10/sys/kern/sched_4bsd.c (revision 284665) @@ -1,1785 +1,1785 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) /* * The schedulable entity that runs a context. 
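To put numbers on the ESTCPULIM() clamp defined above: on a uniprocessor kernel INVERSE_ESTCPU_WEIGHT is 8 and NICE_WEIGHT is 1, and taking the usual values PRIO_MAX - PRIO_MIN = 40 and RQ_PPQ = 4 (both assumed here, they come from other headers), the ceiling works out to 295:

#include <stdio.h>

int
main(void)
{
	int inverse_estcpu_weight = 8;	/* UP value; 8 * smp_cpus with SMP */
	int nice_weight = 1;
	int prio_range = 40;		/* PRIO_MAX - PRIO_MIN, assumed */
	int rq_ppq = 4;			/* run-queue priorities per queue, assumed */
	int limit;

	limit = inverse_estcpu_weight * (nice_weight * prio_range - rq_ppq) +
	    inverse_estcpu_weight - 1;
	printf("estcpu ceiling = %d\n", limit);	/* 8 * 36 + 7 = 295 */
	return (0);
}

Keeping td_estcpu below roughly 295 bounds the estcpu / INVERSE_ESTCPU_WEIGHT term that resetpriority() adds later, so accumulated CPU usage can never push a thread outside the timeshare priority range.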
* This is an extension to the thread structure and is tailored to * the requirements of this scheduler */ struct td_sched { fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */ int ts_cpticks; /* (j) Ticks of cpu time. */ int ts_slptime; /* (j) Seconds !RUNNING. */ int ts_slice; /* Remaining part of time slice. */ int ts_flags; struct runq *ts_runq; /* runq the thread is currently on */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in td_flags */ #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* flags kept in ts_flags */ #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */ #define SKE_RUNQ_PCPU(ts) \ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) static struct td_sched td_sched0; struct mtx sched_lock; static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */ static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_slice = 12; /* Thread run time before rescheduling. */ static void setup_runqs(void); static void schedcpu(void); static void schedcpu_thread(void); static void sched_priority(struct thread *td, u_char prio); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct thread *td); static void resetpriority(struct thread *td); static void resetpriority_thread(struct thread *td); #ifdef SMP static int sched_pickcpu(struct thread *td); static int forward_wakeup(int cpunum); static void kick_other_cpu(int pri, int cpuid); #endif static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, NULL }; SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start, &sched_kp); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); /* * Global run queue. 
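The defaults above, realstathz initially 127 (stathz may still be 0 this early) and sched_slice of 12 stathz ticks, and the kern.sched.quantum sysctl a little further down all express the same quantity in different units; sched_initticks() later resets sched_slice to realstathz / 10, about 100 ms. A quick check of the conversions, using the same rounding as sysctl_kern_quantum() and sched_rr_interval() and assuming typical hz = 1000, stathz = 127:

#include <stdio.h>

int
main(void)
{
	int hz = 1000, stathz = 127;		/* typical values, assumed */
	int realstathz = stathz;
	int sched_slice = realstathz / 10;	/* what sched_initticks() does */
	int period = 1000000 / realstathz;	/* one stathz tick, in microseconds */

	printf("slice       = %d stathz ticks\n", sched_slice);	/* 12 */
	printf("quantum     = %d us\n", period * sched_slice);		/* ~94 ms */
	printf("rr interval = %d hz ticks\n",
	    (sched_slice * hz + realstathz / 2) / realstathz);		/* 94 */
	return (0);
}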
*/ static struct runq runq; #ifdef SMP /* * Per-CPU run queues */ static struct runq runq_pcpu[MAXCPU]; long runq_length[MAXCPU]; static cpuset_t idle_cpus_mask; #endif struct pcpuidlestat { u_int idlecalls; u_int oldidlecalls; }; static DPCPU_DEFINE(struct pcpuidlestat, idlestat); static void setup_runqs(void) { #ifdef SMP int i; for (i = 0; i < MAXCPU; ++i) runq_init(&runq_pcpu[i]); #endif runq_init(&runq); } static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); #ifdef SMP /* Enable forwarding of wakeups to all other cpus */ static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP"); static int runq_fuzz = 1; SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); static int forward_wakeup_enabled = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW, &forward_wakeup_enabled, 0, "Forwarding of wakeup to idle CPUs"); static int forward_wakeups_requested = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD, &forward_wakeups_requested, 0, "Requests for Forwarding of wakeup to idle CPUs"); static int forward_wakeups_delivered = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD, &forward_wakeups_delivered, 0, "Completed Forwarding of wakeup to idle CPUs"); static int forward_wakeup_use_mask = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW, &forward_wakeup_use_mask, 0, "Use the mask of idle cpus"); static int forward_wakeup_use_loop = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW, &forward_wakeup_use_loop, 0, "Use a loop to find idle cpus"); #endif #if 0 static int sched_followon = 0; SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW, &sched_followon, 0, "allow threads to share a quantum"); #endif SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); static __inline void sched_load_add(void) { sched_tdcnt++; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } static __inline void sched_load_rem(void) { sched_tdcnt--; KTR_COUNTER0(KTR_SCHED, "load", "global load", 
sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * This function is called when a thread is about to be put on run queue * because it has been made runnable or its priority has been adjusted. It * determines if the new thread should be immediately preempted to. If so, * it switches to it and eventually returns true. If not, it returns false * so that the caller may place the thread on an appropriate run queue. */ int maybe_preempt(struct thread *td) { #ifdef PREEMPTION struct thread *ctd; int cpri, pri; /* * The new thread should not preempt the current thread if any of the * following conditions are true: * * - The kernel is in the throes of crashing (panicstr). * - The current thread has a higher (numerically lower) or * equivalent priority. Note that this prevents curthread from * trying to preempt to itself. * - It is too early in the boot for context switches (cold is set). * - The current thread has an inhibitor set or is in the process of * exiting. In this case, the current thread is about to switch * out anyways, so there's no point in preempting. If we did, * the current thread would not be properly resumed as well, so * just avoid that whole landmine. * - If the new thread's priority is not a realtime priority and * the current thread's priority is not an idle priority and * FULL_PREEMPTION is disabled. * * If all of these conditions are false, but the current thread is in * a nested critical section, then we have to defer the preemption * until we exit the critical section. Otherwise, switch immediately * to the new thread. */ ctd = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("maybe_preempt: trying to run inhibited thread")); pri = td->td_priority; cpri = ctd->td_priority; if (panicstr != NULL || pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd)) return (0); #ifndef FULL_PREEMPTION if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) return (0); #endif if (ctd->td_critnest > 1) { CTR1(KTR_PROC, "maybe_preempt: in critical section %d", ctd->td_critnest); ctd->td_owepreempt = 1; return (0); } /* * Thread is runnable but not yet put on system run queue. */ MPASS(ctd->td_lock == td->td_lock); MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_name); mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td); /* * td's lock pointer may have changed. We have to return with it * locked. */ spinlock_enter(); thread_unlock(ctd); thread_lock(td); spinlock_exit(); return (1); #else return (0); #endif } /* * Constants for digital decay and forget: * 90% of (td_estcpu) usage in 5 * loadav time * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates td_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds. 
* That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * td_estcpu *= decay; * will compute * td_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct td_sched *ts; int awake; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { awake = 0; thread_lock(td); ts = td->td_sched; /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The td_sched slptimes are not touched in wakeup * because the thread may not HAVE everything in * memory? XXX I think this is out of date. 
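The comment above claims that decay = (2 * loadavg) / (2 * loadavg + 1) satisfies decay ** (5 * loadavg) ~= .1 and lists the exact powers (5.68, 10.32, 14.94, 19.55) that would make the approximation exact. Both are easy to reproduce numerically, since the exact power is ln(0.1) / ln(b / (b + 1)) with b = 2 * loadavg:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double b, decay;
	int loadav;

	for (loadav = 1; loadav <= 4; loadav++) {
		b = 2.0 * loadav;
		decay = b / (b + 1.0);
		printf("loadav %d: decay^%d = %.3f, exact power = %.2f\n",
		    loadav, 5 * loadav, pow(decay, 5 * loadav),
		    log(0.1) / log(decay));
	}
	return (0);
}

The printed powers match the table in the comment, and decay ** (5 * loadavg) lands between 0.09 and 0.14, close enough to the advertised 90% forgetting for scheduling purposes.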
*/ if (TD_ON_RUNQ(td)) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } else if (TD_IS_RUNNING(td)) { awake = 1; /* Do not clear TDF_DIDRUN */ } else if (td->td_flags & TDF_DIDRUN) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } /* * ts_pctcpu is only for ps and ttyinfo(). */ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT; /* * If the td_sched has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) ts->ts_pctcpu += (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ts->ts_pctcpu += ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ts->ts_cpticks = 0; } /* * If there are ANY running threads in this process, * then don't count it as sleeping. * XXX: this is broken. */ if (awake) { if (ts->ts_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(td); } ts->ts_slptime = 0; } else ts->ts_slptime++; if (ts->ts_slptime > 1) { thread_unlock(td); continue; } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Main loop for a kthread that executes schedcpu once a second. */ static void schedcpu_thread(void) { for (;;) { schedcpu(); pause("-", hz); } } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max td_estcpu of 255, sleeping for at * least six times the loadfactor will decay td_estcpu to zero. */ static void updatepri(struct thread *td) { struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td->td_sched; loadfac = loadfactor(averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) td->td_estcpu = 0; else { newcpu = td->td_estcpu; ts->ts_slptime--; /* was incremented in schedcpu() */ while (newcpu && --ts->ts_slptime) newcpu = decay_cpu(loadfac, newcpu); td->td_estcpu = newcpu; } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct thread *td) { register unsigned int newpriority; if (td->td_pri_class == PRI_TIMESHARE) { newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); sched_user_prio(td, newpriority); } } /* * Update the thread's priority when the associated process's user * priority changes. */ static void resetpriority_thread(struct thread *td) { /* Only change threads with a time sharing user priority. */ if (td->td_priority < PRI_MIN_TIMESHARE || td->td_priority > PRI_MAX_TIMESHARE) return; /* XXX the whole needresched thing is broken, but not silly. */ maybe_resched(td); sched_prio(td, td->td_user_pri); } /* ARGSUSED */ static void sched_setup(void *dummy) { setup_runqs(); /* Account for thread0. */ sched_load_add(); } /* * This routine determines time constants after stathz and hz are setup. */ static void sched_initticks(void *dummy) { realstathz = stathz ? 
stathz : hz; sched_slice = realstathz / 10; /* ~100ms */ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); } /* External interfaces start here */ /* * Very early in the boot some setup of scheduler-specific * parts of proc0 and of some scheduler resources needs to be done. * Called from: * proc0_init() */ void schedinit(void) { /* * Set up the scheduler specific parts of proc0. */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; thread0.td_lock = &sched_lock; td_sched0.ts_slice = sched_slice; mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); } int sched_runnable(void) { #ifdef SMP return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); #else return runq_check(&runq); #endif } int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (td_estcpu) is increased here. resetpriority() will * compute a different priority each time td_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct thread *td) { struct pcpuidlestat *stat; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; ts->ts_cpticks++; td->td_estcpu = ESTCPULIM(td->td_estcpu + 1); if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(td); resetpriority_thread(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) { ts->ts_slice = sched_slice; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } stat = DPCPU_PTR(idlestat); stat->oldidlecalls = stat->idlecalls; stat->idlecalls = 0; } /* * Charge child's scheduling CPU usage to parent. 
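Tying sched_clock() above back to resetpriority(): every statclock tick adds one to td_estcpu, and each time it crosses a multiple of INVERSE_ESTCPU_WEIGHT the user priority is recomputed as PUSER + estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (nice - PRIO_MIN), clamped to the timeshare band. A worked example with assumed FreeBSD 10 constants, PUSER = PRI_MIN_TIMESHARE = 120, PRI_MAX_TIMESHARE = 223 and PRIO_MIN = -20, for a nice-0 thread on one CPU:

#include <stdio.h>

#define	MINV(a, b)	((a) < (b) ? (a) : (b))
#define	MAXV(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	int puser = 120, pri_max_ts = 223;	/* assumed values */
	int inverse_estcpu_weight = 8, nice_weight = 1, prio_min = -20;
	int nice = 0, estcpu, newpri;

	for (estcpu = 0; estcpu <= 295; estcpu += 59) {
		newpri = puser + estcpu / inverse_estcpu_weight +
		    nice_weight * (nice - prio_min);
		newpri = MINV(MAXV(newpri, puser), pri_max_ts);
		printf("estcpu %3d -> user priority %d\n", estcpu, newpri);
	}
	return (0);
}

Larger numbers mean weaker priority, so a CPU hog drifts from 140 toward 176 here, while the decay applied by schedcpu() pulls threads that stop running back up again.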
*/ void sched_exit(struct proc *p, struct thread *td) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit", "prio:%d", td->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit", "prio:%d", child->td_priority); thread_lock(td); td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); thread_unlock(td); thread_lock(child); if ((child->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); thread_unlock(child); } void sched_fork(struct thread *td, struct thread *childtd) { sched_fork_thread(td, childtd); } void sched_fork_thread(struct thread *td, struct thread *childtd) { struct td_sched *ts; childtd->td_estcpu = td->td_estcpu; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); childtd->td_priority = childtd->td_base_pri; ts = childtd->td_sched; bzero(ts, sizeof(*ts)); ts->ts_flags |= (td->td_sched->ts_flags & TSF_AFFINITY); ts->ts_slice = 1; } void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } } void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } /* * Adjust the priority of a thread. */ static void sched_priority(struct thread *td, u_char prio) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio > td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { sched_rem(td); sched_add(td, SRQ_BORING); } } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regulary priority is less * important than prio the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_prio(td, base_pri); } else sched_lend_prio(td, prio); } void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't ever * lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. 
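The lend/unlend pair above is the 4BSD side of priority propagation: a lent priority is applied unconditionally and TDF_BORROWING is set, and when propagation ends sched_unlend_prio() either falls back to the thread's own base (or user) priority or keeps borrowing, whichever is more important, i.e. numerically lower. The decision in isolation, with made-up priority values:

#include <stdio.h>

/* Priority the thread ends up with once a lending request is withdrawn. */
static int
unlend(int base_pri, int remaining_floor)
{
	/*
	 * remaining_floor is the least important priority that still has to
	 * be honoured for other lenders; lower numbers are more important.
	 */
	return (remaining_floor >= base_pri ? base_pri : remaining_floor);
}

int
main(void)
{
	printf("%d\n", unlend(150, 200));	/* boost no longer needed -> 150 */
	printf("%d\n", unlend(150, 100));	/* still lent a better one -> 100 */
	return (0);
}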
*/ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } void sched_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, int pri) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; td->td_sched->ts_slptime = 0; if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); if (TD_IS_SUSPENDED(td) || pri >= PSOCK) td->td_flags |= TDF_CANSWAP; } void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct mtx *tmtx; struct td_sched *ts; struct proc *p; int preempted; tmtx = NULL; ts = td->td_sched; p = td->td_proc; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Switch to the sched lock to fix things up and pick * a new thread. * Block the td_lock in order to avoid breaking the critical path. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); tmtx = thread_lock_block(td); } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); td->td_lastcpu = td->td_oncpu; preempted = !((td->td_flags & TDF_SLICEEND) || (flags & SWT_RELINQUISH)); td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; td->td_oncpu = NOCPU; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. We never put the idle * threads on the run queue, however. */ if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { /* Put us back on the run queue. */ sched_add(td, preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING); } } if (newtd) { /* * The thread we are about to run needs to be counted * as if it had been added to the run queue and selected. * It came from: * * A preemption * * An upcall * * A followon */ KASSERT((newtd->td_inhibitors == 0), ("trying to run inhibited thread")); newtd->td_flags |= TDF_DIDRUN; TD_SET_RUNNING(newtd); if ((newtd->td_flags & TDF_NOLOAD) == 0) sched_load_add(); } else { newtd = choosethread(); MPASS(newtd->td_lock == &sched_lock); } if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); /* I feel sleepy */ lock_profile_release_lock(&sched_lock.lock_object); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock); lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, * but any amount of time may have passed. All our context * will still be available as will local variables. * PCPU values however may have changed as we may have * changed CPU so don't trust cached values of them. 
* New threads will go to fork_exit() instead of here * so if you change things here you may need to change * things there too. * * If the thread above was exiting it will never wake * up again here, so either it has saved everything it * needed to, or the thread_wait() or wait() will * need to reap it. */ SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else SDT_PROBE0(sched, , , remain__cpu); #ifdef SMP if (td->td_flags & TDF_IDLETD) CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; td->td_flags &= ~TDF_CANSWAP; if (ts->ts_slptime > 1) { updatepri(td); resetpriority(td); } td->td_slptick = 0; ts->ts_slptime = 0; ts->ts_slice = sched_slice; sched_add(td, SRQ_BORING); } #ifdef SMP static int forward_wakeup(int cpunum) { struct pcpu *pc; cpuset_t dontuse, map, map2; u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); CTR0(KTR_RUNQ, "forward_wakeup()"); if ((!forward_wakeup_enabled) || (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0)) return (0); if (!smp_started || cold || panicstr) return (0); forward_wakeups_requested++; /* * Check the idle mask we received against what we calculated * before in the old version. */ me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. */ if (CPU_ISSET(me, &idle_cpus_mask) && (cpunum == NOCPU || me == cpunum)) return (0); CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &stopped_cpus); CPU_OR(&dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { CPU_SET(id, &map2); } } } if (forward_wakeup_use_mask) { map = idle_cpus_mask; CPU_NAND(&map, &dontuse); /* If they are both on, compare and use loop if different. */ if (forward_wakeup_use_loop) { if (CPU_CMP(&map, &map2)) { printf("map != map2, loop method preferred\n"); map = map2; } } } else { map = map2; } /* If we only allow a specific CPU, then mask off all the others. */ if (cpunum != NOCPU) { KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum.")); iscpuset = CPU_ISSET(cpunum, &map); if (iscpuset == 0) CPU_ZERO(&map); else CPU_SETOF(cpunum, &map); } if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); return (1); } if (cpunum == NOCPU) printf("forward_wakeup: Idle processor not found\n"); return (0); } static void kick_other_cpu(int pri, int cpuid) { struct pcpu *pcpu; int cpri; pcpu = pcpu_find(cpuid); if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); return; } cpri = pcpu->pc_curthread->td_priority; if (pri >= cpri) return; #if defined(IPI_PREEMPTION) && defined(PREEMPTION) #if !defined(FULL_PREEMPTION) if (pri <= PRI_MAX_ITHD) #endif /* ! 
FULL_PREEMPTION */ { ipi_cpu(cpuid, IPI_PREEMPT); return; } #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED; ipi_cpu(cpuid, IPI_AST); return; } #endif /* SMP */ #ifdef SMP static int sched_pickcpu(struct thread *td) { int best, cpu; mtx_assert(&sched_lock, MA_OWNED); if (THREAD_CAN_SCHED(td, td->td_lastcpu)) best = td->td_lastcpu; else best = NOCPU; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) continue; if (best == NOCPU) best = cpu; else if (runq_length[cpu] < runq_length[best]) best = cpu; } KASSERT(best != NOCPU, ("no valid CPUs")); return (best); } #endif void sched_add(struct thread *td, int flags) #ifdef SMP { cpuset_t tidlemsk; struct td_sched *ts; u_int cpu, cpuid; int forwarded = 0; int single_cpu = 0; ts = td->td_sched; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); /* * If SMP is started and the thread is pinned or otherwise limited to * a specific set of CPUs, queue the thread to a per-CPU run queue. * Otherwise, queue the thread to the global run queue. * * If SMP has not yet been started we must use the global run queue * as per-CPU state may not be initialized yet and we may crash if we * try to access the per-CPU run queues. */ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND || ts->ts_flags & TSF_AFFINITY)) { if (td->td_pinned != 0) cpu = td->td_lastcpu; else if (td->td_flags & TDF_BOUND) { /* Find CPU from bound runq. 
*/ KASSERT(SKE_RUNQ_PCPU(ts), ("sched_add: bound td_sched not on cpu runq")); cpu = ts->ts_runq - &runq_pcpu[0]; } else /* Find a valid CPU for our cpuset */ cpu = sched_pickcpu(td); ts->ts_runq = &runq_pcpu[cpu]; single_cpu = 1; CTR3(KTR_RUNQ, "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); } else { CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td); cpu = NOCPU; ts->ts_runq = &runq; } cpuid = PCPU_GET(cpuid); if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { tidlemsk = idle_cpus_mask; CPU_NAND(&tidlemsk, &hlt_cpus_mask); CPU_CLR(cpuid, &tidlemsk); if (!CPU_ISSET(cpuid, &idle_cpus_mask) && ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } if (!forwarded) { if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td)) return; else maybe_resched(td); } } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (cpu != NOCPU) runq_length[cpu]++; } #else /* SMP */ { struct td_sched *ts; ts = td->td_sched; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; /* * If we are yielding (on the way out anyhow) or the thread * being saved is US, then don't try be smart about preemption * or kicking off another CPU as it won't help and may hinder. * In the YIEDLING case, we are about to run whoever is being * put in the queue anyhow, and in the OURSELF case, we are * puting ourself on the run queue which also only happens * when we are about to yield. */ if ((flags & SRQ_YIELDING) == 0) { if (maybe_preempt(td)) return; } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); maybe_resched(td); } #endif /* SMP */ void sched_rem(struct thread *td) { struct td_sched *ts; ts = td->td_sched; KASSERT(td->td_flags & TDF_INMEM, ("sched_rem: thread swapped out")); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); mtx_assert(&sched_lock, MA_OWNED); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); #ifdef SMP if (ts->ts_runq != &runq) runq_length[ts->ts_runq - runq_pcpu]--; #endif runq_remove(ts->ts_runq, td); TD_SET_CAN_RUN(td); } /* * Select threads to run. Note that running threads still consume a * slot. 
*/ struct thread * sched_choose(void) { struct thread *td; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct thread *tdcpu; rq = &runq; td = runq_choose_fuzz(&runq, runq_fuzz); tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); if (td == NULL || (tdcpu != NULL && tdcpu->td_priority < td->td_priority)) { CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu, PCPU_GET(cpuid)); td = tdcpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td); } #else rq = &runq; td = runq_choose(&runq); #endif if (td) { #ifdef SMP if (td == tdcpu) runq_length[PCPU_GET(cpuid)]--; #endif runq_remove(rq, td); td->td_flags |= TDF_DIDRUN; KASSERT(td->td_flags & TDF_INMEM, ("sched_choose: thread swapped out")); return (td); } return (PCPU_GET(idlethread)); } void sched_preempt(struct thread *td) { SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); if (td->td_critnest > 1) td->td_owepreempt = 1; else mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL); thread_unlock(td); } void sched_userret(struct thread *td) { /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; thread_unlock(td); } } void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td->td_sched; td->td_flags |= TDF_BOUND; #ifdef SMP ts->ts_runq = &runq_pcpu[cpu]; if (PCPU_GET(cpuid) == cpu) return; mi_switch(SW_VOL, NULL); #endif } void sched_unbind(struct thread* td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); td->td_flags &= ~TDF_BOUND; } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_flags & TDF_BOUND); } void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } int sched_load(void) { return (sched_tdcnt); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } fixpt_t sched_pctcpu(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; return (ts->ts_pctcpu); } -#ifdef RACCT +#ifdef RACCT /* * Calculates the contribution to the thread cpu usage for the latest * (unfinished) second. */ fixpt_t sched_pctcpu_delta(struct thread *td) { struct td_sched *ts; fixpt_t delta; int realstathz; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; delta = 0; realstathz = stathz ? stathz : hz; if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) delta = (realstathz == 100) ? 
((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else delta = ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif } return (delta); } #endif void sched_tick(int cnt) { } /* * The actual idle process. */ void sched_idletd(void *dummy) { struct pcpuidlestat *stat; THREAD_NO_SLEEPING(); stat = DPCPU_PTR(idlestat); for (;;) { mtx_assert(&Giant, MA_NOTOWNED); while (sched_runnable() == 0) { cpu_idle(stat->idlecalls + stat->oldidlecalls > 64); stat->idlecalls++; } mtx_lock_spin(&sched_lock); mi_switch(SW_VOL | SWT_IDLE, NULL); mtx_unlock_spin(&sched_lock); } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { /* * Correct spinlock nesting. The idle thread context that we are * borrowing was created so that it would start out with a single * spin lock (sched_lock) held in fork_trampoline(). Since we've * explicitly acquired locks in this function, the nesting count * is now 2 rather than 1. Since we are nested, calling * spinlock_exit() will simply adjust the counts without allowing * spin lock using code to interrupt us. */ if (td == NULL) { mtx_lock_spin(&sched_lock); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { lock_profile_release_lock(&sched_lock.lock_object); MPASS(td->td_lock == &sched_lock); } mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); cpu_throw(td, choosethread()); /* doesn't return */ } void sched_fork_exit(struct thread *td) { /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td->td_oncpu = PCPU_GET(cpuid); sched_lock.mtx_lock = (uintptr_t)td; lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); } char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td->td_sched; if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td->td_sched; ts->ts_name[0] = '\0'; } #endif void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; int cpu; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Set the TSF_AFFINITY flag if there is at least one CPU this * thread can't run on. */ ts = td->td_sched; ts->ts_flags &= ~TSF_AFFINITY; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) { ts->ts_flags |= TSF_AFFINITY; break; } } /* * If this thread can run on all CPUs, nothing else to do. */ if (!(ts->ts_flags & TSF_AFFINITY)) return; /* Pinned threads and bound threads should be left alone. */ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND) return; switch (td->td_state) { case TDS_RUNQ: /* * If we are on a per-CPU runqueue that is in the set, * then nothing needs to be done. */ if (ts->ts_runq != &runq && THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu)) return; /* Put this thread on a valid per-CPU runqueue. */ sched_rem(td); sched_add(td, SRQ_BORING); break; case TDS_RUNNING: /* * See if our current CPU is in the set. If not, force a * context switch. 
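 */

/*
 * Illustrative, standalone sketch (not part of this change): the core of
 * sched_pctcpu_delta() above is expressing the stat-clock ticks charged
 * during the current, unfinished second as a fixpt_t fraction.  The userland
 * program below shows only that fraction, assuming the customary FSHIFT of
 * 11 (FSCALE == 2048); the kernel version additionally folds in the ccpu
 * decay factor and picks between the two branches shown above.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint32_t fixpt_t;

#define FSHIFT  11                      /* assumed value */
#define FSCALE  (1 << FSHIFT)

int
main(void)
{
    int realstathz = 128;               /* stat clock rate in Hz (assumed) */
    int cpticks = 32;                   /* ticks charged so far this second */
    fixpt_t delta;

    /* Fraction of the current second spent on CPU, in fixed point. */
    delta = (fixpt_t)cpticks * FSCALE / realstathz;
    printf("delta = %u/%u of FSCALE (about %u%% CPU)\n",
        delta, (unsigned)FSCALE, (delta * 100) >> FSHIFT);
    return (0);
}

/*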
*/ if (THREAD_CAN_SCHED(td, td->td_oncpu)) return; td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(cpu, IPI_AST); break; default: break; } #endif } Index: stable/10/sys/kern/subr_trap.c =================================================================== --- stable/10/sys/kern/subr_trap.c (revision 284664) +++ stable/10/sys/kern/subr_trap.c (revision 284665) @@ -1,310 +1,313 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007 The FreeBSD Foundation * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_kdtrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #ifdef VIMAGE #include #endif #ifdef XEN #include #include #include #endif #ifdef HWPMC_HOOKS #include #endif #include void (*softdep_ast_cleanup)(void); /* * Define the code needed before returning to user mode, for trap and * syscall. */ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, td->td_name); KASSERT((p->p_flag & P_WEXIT) == 0, ("Exiting process returns to usermode")); #if 0 #ifdef DIAGNOSTIC /* Check that we called signotify() enough. 
*/ PROC_LOCK(p); thread_lock(td); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); thread_unlock(td); PROC_UNLOCK(p); #endif #endif #ifdef KTRACE KTRUSERRET(td); #endif if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); /* * Let the scheduler adjust our priority etc. */ sched_userret(td); #ifdef XEN PT_UPDATES_FLUSH(); #endif /* * Check for misbehavior. * * If callchain tracing is ongoing because of * hwpmc(4), skip the scheduler pinning check. * The hwpmc(4) subsystem will, in fact, collect callchain information * at the ast() checkpoint, which is past userret(). */ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning"); KASSERT(td->td_critnest == 0, ("userret: Returning in a critical section")); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held", td->td_locks)); KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, ("userret: Returning with pagefaults disabled")); KASSERT(td->td_no_sleeping == 0, ("userret: Returning with sleep disabled")); KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0, ("userret: Returning with pinned thread")); KASSERT(td->td_vp_reserv == 0, ("userret: Returning while holding vnode reservation")); KASSERT((td->td_flags & TDF_SBDRY) == 0, ("userret: Returning with stop signals deferred")); KASSERT(td->td_su == NULL, ("userret: Returning with SU cleanup request not handled")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s", __func__, td, p->p_pid, td->td_name, curvnet, (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A")); #endif -#ifdef RACCT - PROC_LOCK(p); - while (p->p_throttled == 1) - msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); - PROC_UNLOCK(p); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + while (p->p_throttled == 1) + msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); + PROC_UNLOCK(p); + } #endif } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. */ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; int flags; int sig; td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; /* * This updates the td_flags for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in flags, the astpending flag will be set and * ast() will be called again.
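 */

/*
 * Hedged userland analogue (all identifiers below are invented): the
 * userret() hunk above only parks a throttled process when racct_enable is
 * set, sleeping on p->p_racct until the accounting code wakes it.  The same
 * sleep/wake shape, re-expressed with pthreads, looks like this; the kernel
 * pairs msleep() and wakeup() on a channel under p->p_mtx instead of a
 * condition variable.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t throttle_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  throttle_cv  = PTHREAD_COND_INITIALIZER;
static int throttled = 1;
static int accounting_enabled = 1;      /* stand-in for the racct_enable gate */

static void
wait_until_unthrottled(void)
{
    if (!accounting_enabled)            /* skip the check when accounting is off */
        return;
    pthread_mutex_lock(&throttle_mtx);
    while (throttled)                   /* re-check after every wakeup */
        pthread_cond_wait(&throttle_cv, &throttle_mtx);
    pthread_mutex_unlock(&throttle_mtx);
}

static void *
unthrottle(void *arg)
{
    (void)arg;
    sleep(1);                           /* pretend the over-use has been repaid */
    pthread_mutex_lock(&throttle_mtx);
    throttled = 0;
    pthread_cond_broadcast(&throttle_cv);   /* plays the role of wakeup() */
    pthread_mutex_unlock(&throttle_mtx);
    return (NULL);
}

int
main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, unthrottle, NULL);
    wait_until_unthrottled();
    printf("resumed\n");
    pthread_join(t, NULL);
    return (0);
}

/*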
*/ thread_lock(td); flags = td->td_flags; td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK | TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND); thread_unlock(td); PCPU_INC(cnt.v_trap); if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } #ifdef HWPMC_HOOKS /* Handle Software PMC callchain capture. */ if (PMC_IS_PENDING_CALLCHAIN(td)) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep); #endif if (flags & TDF_ALRMPEND) { PROC_LOCK(p); kern_psignal(p, SIGVTALRM); PROC_UNLOCK(p); } if (flags & TDF_PROFPEND) { PROC_LOCK(p); kern_psignal(p, SIGPROF); PROC_UNLOCK(p); } #ifdef MAC if (flags & TDF_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1, __func__); #endif thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL); thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1, __func__); #endif } /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. */ if (flags & TDF_NEEDSUSPCHK) { PROC_LOCK(p); thread_suspend_check(0); PROC_UNLOCK(p); } if (td->td_pflags & TDP_OLDMASK) { td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } userret(td, framep); } const char * syscallname(struct proc *p, u_int code) { static const char unknown[] = "unknown"; struct sysentvec *sv; sv = p->p_sysent; if (sv->sv_syscallnames == NULL || code >= sv->sv_size) return (unknown); return (sv->sv_syscallnames[code]); } Index: stable/10/sys/kern/sysv_msg.c =================================================================== --- stable/10/sys/kern/sysv_msg.c (revision 284664) +++ stable/10/sys/kern/sysv_msg.c (revision 284665) @@ -1,1592 +1,1596 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? 
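 */

/*
 * Standalone illustration of the MSQID()/MSQID_IX()/MSQID_SEQ() macros above:
 * the array index lives in the low 16 bits of the identifier and the per-slot
 * sequence number in the high 16 bits, so a stale id is detected once a slot
 * has been recycled.  The "seq" parameter below stands in for msg_perm.seq;
 * the values are arbitrary.
 */
#include <stdio.h>

#define EX_MSQID(ix, seq)   (((ix) & 0xffff) | (((seq) << 16) & 0xffff0000))
#define EX_MSQID_IX(id)     ((id) & 0xffff)
#define EX_MSQID_SEQ(id)    (((id) >> 16) & 0xffff)

int
main(void)
{
    int ix = 7, seq = 0x1234;
    int id = EX_MSQID(ix, seq);

    /* Unpacking recovers both halves of the identifier. */
    printf("id=0x%08x index=%d seq=0x%x\n",
        id, EX_MSQID_IX(id), EX_MSQID_SEQ(id));
    return (0);
}

/*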
*/ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { int i, error; TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni); TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb); TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql); msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... 
*/ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); error = syscall_helper_register(msg_syscalls); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(td, uap) struct thread *td; register struct msgctl_args *uap; { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == 
IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; register struct msqid_kernel *msqkptr; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { struct msg *msghdr; if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. */ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } break; case IPC_SET: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! 
*/ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(td, uap) struct thread *td; register struct msgget_args *uap; { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; register struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
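 */

/*
 * Usage sketch (userland, assumed key value): the lookup and creation paths
 * in msgget() around this point implement the standard contract that an
 * existing key combined with IPC_CREAT | IPC_EXCL fails with EEXIST, while a
 * missing key without IPC_CREAT fails with ENOENT.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
    key_t key = 0x1234;                 /* arbitrary example key */
    int id;

    /* First exclusive create succeeds... */
    id = msgget(key, IPC_CREAT | IPC_EXCL | 0600);
    if (id == -1) {
        perror("msgget");
        return (1);
    }
    /* ...a second exclusive create on the same key reports EEXIST. */
    if (msgget(key, IPC_CREAT | IPC_EXCL | 0600) == -1 && errno == EEXIST)
        printf("queue %d already exists, as expected\n", id);
    msgctl(id, IPC_RMID, NULL);         /* clean up the example queue */
    return (0);
}

/*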
*/ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_NMSGQ, 1); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = ENOSPC; - goto done2; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_NMSGQ, 1); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; size_t msgsz; int msgflg; }; #endif int kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) struct thread *td; int msqid; const void *msgp; /* XXX msgp is actually mtext. 
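 */

/*
 * Hedged sketch of the charge/refund discipline used above (all names below
 * are invented): with RACCT compiled in, each allocation path first checks
 * racct_enable, charges the resource under the process lock, fails the
 * operation if the charge is refused, and refunds the charge if the
 * operation fails later (compare the done3: path in kern_msgsnd()).
 */
#include <errno.h>
#include <stdio.h>

struct accounting {
    int  enabled;                       /* plays the role of racct_enable */
    long used;
    long limit;
};

/* Charge "amount" against the limit; fail with no side effects if over. */
static int
acct_charge(struct accounting *a, long amount)
{
    if (!a->enabled)
        return (0);                     /* compiled in but switched off */
    if (a->used + amount > a->limit)
        return (EAGAIN);
    a->used += amount;
    return (0);
}

static void
acct_refund(struct accounting *a, long amount)
{
    if (a->enabled)
        a->used -= amount;
}

int
main(void)
{
    struct accounting a = { 1, 0, 2 };
    int error;

    error = acct_charge(&a, 1);         /* e.g. one queued message */
    if (error == 0 && (error = acct_charge(&a, 2)) != 0)
        acct_refund(&a, 1);             /* roll back the first charge */
    printf("error=%d used=%ld\n", error, a.used);
    return (0);
}

/*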
*/ size_t msgsz; int msgflg; long mtype; { int msqix, segs_needed, error = 0; register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; #ifdef RACCT size_t saved_msgsz; #endif if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT - PROC_LOCK(td->td_proc); - if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } + saved_msgsz = msgsz; + if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { + racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); + PROC_UNLOCK(td->td_proc); + error = EAGAIN; + goto done2; + } PROC_UNLOCK(td->td_proc); - error = EAGAIN; - goto done2; } - saved_msgsz = msgsz; - if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { - racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); - PROC_UNLOCK(td->td_proc); - error = EAGAIN; - goto done2; - } - PROC_UNLOCK(td->td_proc); #endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done3; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the 
resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? */ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. 
*/ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT - if (error != 0) { + if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(td, uap) struct thread *td; register struct msgsnd_args *uap; { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif int kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) struct thread *td; int msqid; void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; long msgtyp; int msgflg; long *mtype; { size_t len; register struct msqid_kernel *msqkptr; register struct msg *msghdr; int msqix, error = 0; short next; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? * Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! 
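 */

/*
 * Standalone sketch of the type selection applied by the test that follows:
 * msgtyp == 0 accepts the first message, msgtyp > 0 accepts only an exact
 * type match, and msgtyp < 0 accepts the first message whose type is less
 * than or equal to |msgtyp|.
 */
#include <stdio.h>

/* Nonzero if a queued message of type msg_type satisfies request msgtyp. */
static int
msgtyp_matches(long msgtyp, long msg_type)
{
    if (msgtyp == 0)
        return (1);                     /* any message will do */
    if (msgtyp > 0)
        return (msg_type == msgtyp);    /* exact match only */
    return (msg_type <= -msgtyp);       /* anything up to |msgtyp| */
}

int
main(void)
{
    printf("%d %d %d\n",
        msgtyp_matches(3, 3),           /* 1: exact match */
        msgtyp_matches(3, 2),           /* 0: wrong type */
        msgtyp_matches(-3, 2));         /* 1: 2 <= |-3| */
    return (0);
}

/*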
*/ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. 
*/ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(td, uap) struct thread *td; register struct msgrcv_args *uap; { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, msqids, sizeof(struct msqid_kernel) * msginfo.msgmni)); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_msqids, "", "Message queue IDs"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); 
} #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. */ int sys_msgsys(td, uap) struct thread *td; /* XXX actually varargs. 
*/ struct msgsys_args /* { int which; int a2; int a3; int a4; int a5; int a6; } */ *uap; { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(td, uap) struct thread *td; struct freebsd7_msgctl_args *uap; { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, msg_first); CP(msqold, msqbuf, msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, msg_first); CP(msqbuf, msqold, msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ Index: stable/10/sys/kern/sysv_sem.c =================================================================== --- stable/10/sys/kern/sysv_sem.c (revision 284664) +++ stable/10/sys/kern/sysv_sem.c (revision 284665) @@ -1,1666 +1,1671 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct semid_kernel *semakptr); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* 
semaphore maximum value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { int i, error; TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl); TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm); TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume); TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz); TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx); TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem); sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * 
seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); error = syscall_helper_register(sem_syscalls); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); if (error != 0) semunload(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). 
*/ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); } /* * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. 
*/ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) { sema[i].u.sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; break; case IPC_SET: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); break; case GETNCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. 
* While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. 
*/ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { DPRINTF(("found public key\n")); if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_add(td->td_proc, RACCT_NSEM, nsems); - PROC_UNLOCK(td->td_proc); - if (error != 0) { - error = ENOSPC; - goto done2; + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_add(td->td_proc, RACCT_NSEM, nsems); + PROC_UNLOCK(td->td_proc); + if (error != 0) { + error = ENOSPC; + goto done2; + } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; 
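/*
 * Editorial note, not part of the original commit: every RACCT hunk in this
 * revision follows the same shape -- the pre-existing racct_add() /
 * racct_get_available() calls are wrapped in "if (racct_enable)" so that a
 * kernel built with RACCT and RACCT_DEFAULT_TO_DISABLED skips resource
 * accounting entirely until kern.racct.enable is set.  The disabled sketch
 * below restates that pattern as a stand-alone helper; the helper name
 * sysv_sem_racct_charge() is hypothetical and shown only for illustration.
 */
#if 0
static int
sysv_sem_racct_charge(struct thread *td, int nsems)
{
	int error;

	if (!racct_enable)
		return (0);	/* accounting compiled in but switched off */
	PROC_LOCK(td->td_proc);
	error = racct_add(td->td_proc, RACCT_NSEM, nsems);
	PROC_UNLOCK(td->td_proc);
	/* semget() reports accounting failure as ENOSPC, as above. */
	return (error != 0 ? ENOSPC : 0);
}
#endif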
semtot += nsems; bzero(sema[semid].u.sem_base, sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = 0; struct sem *semptr = 0; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT - PROC_LOCK(td->td_proc); - if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (nsops > + racct_get_available(td->td_proc, RACCT_NSEMOP)) { + PROC_UNLOCK(td->td_proc); + return (E2BIG); + } PROC_UNLOCK(td->td_proc); - return (E2BIG); } - PROC_UNLOCK(td->td_proc); #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). 
*/ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. 
*/ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. */ SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, sema, sizeof(struct semid_kernel) * seminfo.semmni)); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. 
*/ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return (sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, 
&dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ Index: stable/10/sys/kern/sysv_shm.c =================================================================== --- stable/10/sys/kern/sysv_shm.c (revision 284664) +++ stable/10/sys/kern/sysv_shm.c (revision 284665) @@ -1,1372 +1,1376 @@ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /*- * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_shm, "System V shared memory segments support"); static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; static struct shmid_kernel *shmsegs; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(key_t); static struct shmid_kernel *shm_find_segment(int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static int shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); /* * Tuneable values. 
*/ #ifndef SHMMAXPGS #define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { SHMMAX, SHMMIN, SHMMNI, SHMSEG, SHMALL }; static int shm_use_phys; static int shm_allow_removed; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "", "Current number of shared memory segments allocated"); static struct sx sysvshmsx; #define SYSVSHM_LOCK() sx_xlock(&sysvshmsx) #define SYSVSHM_UNLOCK() sx_xunlock(&sysvshmsx) #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int shm_find_segment_by_key(key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } /* * Finds segment either by shmid if is_shmid is true, or by segnum if * is_shmid is false. */ static struct shmid_kernel * shm_find_segment(int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; segnum = is_shmid ? 
IPCID_TO_IX(arg) : arg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg))) return (NULL); return (shmseg); } static void shm_deallocate_segment(struct shmid_kernel *shmseg) { vm_size_t size; SYSVSHM_ASSERT_LOCKED(); vm_object_deallocate(shmseg->object); shmseg->object = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif racct_sub_cred(shmseg->cred, RACCT_NSHM, 1); racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size); crfree(shmseg->cred); shmseg->cred = NULL; } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; vm_size_t size; SYSVSHM_ASSERT_LOCKED(); segnum = IPCID_TO_IX(shmmap_s->shmid); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if ((--shmseg->u.shm_nattch <= 0) && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #ifdef MAC struct shmid_kernel *shmsegptr; #endif int error, i; SYSVSHM_ASSERT_LOCKED(); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) return (EINVAL); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)shmaddr) { break; } } if (i == shminfo.shmseg) return (EINVAL); #ifdef MAC shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]; error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr); if (error != 0) return (error); #endif error = shm_delete_mapping(p->p_vmspace, shmmap_s); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif int sys_shmdt(struct thread *td, struct shmdt_args *uap) { int error; SYSVSHM_LOCK(); error = kern_shmdt_locked(td, uap->shmaddr); SYSVSHM_UNLOCK(); return (error); } static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int error, i, rv; SYSVSHM_ASSERT_LOCKED(); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state), M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment(shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? 
IPC_R : IPC_R|IPC_W); if (error != 0) return (error); #ifdef MAC error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) return (EMFILE); size = round_page(shmseg->u.shm_segsz); prot = VM_PROT_READ; if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (shmaddr != NULL) { if ((shmflg & SHM_RND) != 0) attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1); else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) attach_va = (vm_offset_t)shmaddr; else return (EINVAL); } else { /* * This is just a hint to vm_map_find() about where to * put it. */ PROC_LOCK(p); attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(p, RLIMIT_DATA)); PROC_UNLOCK(p); } vm_object_reference(shmseg->object); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va, size, 0, shmaddr != NULL ? VMFS_NO_SPACE : VMFS_OPTIMAL_SPACE, prot, prot, MAP_INHERIT_SHARE); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->object); return (ENOMEM); } shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; return (error); } int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg) { int error; SYSVSHM_LOCK(); error = kern_shmat_locked(td, shmid, shmaddr, shmflg); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif int sys_shmat(struct thread *td, struct shmat_args *uap) { return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg)); } static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; int error; SYSVSHM_ASSERT_LOCKED(); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); error = 0; switch (cmd) { /* * It is possible that kern_shmctl is being called from the Linux ABI * layer, in which case, we will need to implement IPC_INFO. It should * be noted that other shmctl calls will be funneled through here for * Linix binaries as well. * * NB: The Linux ABI layer will convert this data to structure(s) more * consistent with the Linux ABI. */ case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; return (0); case SHM_INFO: { shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? 
*/ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz != NULL) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; return (0); } } shmseg = shm_find_segment(shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) return (error); #endif switch (cmd) { case SHM_STAT: case IPC_STAT: error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); memcpy(buf, &shmseg->u, sizeof(struct shmid_ds)); if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); } break; case IPC_SET: shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.uid = shmidp->shm_perm.uid; shmseg->u.shm_perm.gid = shmidp->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmidp->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch <= 0) { shm_deallocate_segment(shmseg); shm_last_free = IPCID_TO_IX(shmid); } break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } return (error); } int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { int error; SYSVSHM_LOCK(); error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif int sys_shmctl(struct thread *td, struct shmctl_args *uap) { int error = 0; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not to support this. 
*/ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum) { struct shmid_kernel *shmseg; #ifdef MAC int error; #endif SYSVSHM_ASSERT_LOCKED(); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size != 0 && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) { struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; int i, segnum; size_t size; SYSVSHM_ASSERT_LOCKED(); if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT - PROC_LOCK(td->td_proc); - if (racct_add(td->td_proc, RACCT_NSHM, 1)) { + if (racct_enable) { + PROC_LOCK(td->td_proc); + if (racct_add(td->td_proc, RACCT_NSHM, 1)) { + PROC_UNLOCK(td->td_proc); + return (ENOSPC); + } + if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { + racct_sub(td->td_proc, RACCT_NSHM, 1); + PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } PROC_UNLOCK(td->td_proc); - return (ENOSPC); } - if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { - racct_sub(td->td_proc, RACCT_NSHM, 1); - PROC_UNLOCK(td->td_proc); - return (ENOMEM); - } - PROC_UNLOCK(td->td_proc); #endif /* * We make sure that we have allocated a pager before we need * to. */ shm_object = vm_pager_allocate(shm_use_phys ? 
OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_sub(td->td_proc, RACCT_NSHM, 1); - racct_sub(td->td_proc, RACCT_SHMSIZE, size); - PROC_UNLOCK(td->td_proc); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_sub(td->td_proc, RACCT_NSHM, 1); + racct_sub(td->td_proc, RACCT_SHMSIZE, size); + PROC_UNLOCK(td->td_proc); + } #endif return (ENOMEM); } shm_object->pg_color = 0; VM_OBJECT_WLOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shm_object); shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmseg->cred = crhold(cred); shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_sysvshm_create(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif int sys_shmget(struct thread *td, struct shmget_args *uap) { int segnum, mode; int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap, mode); } else { segnum = shm_find_segment_by_key(uap->key); if (segnum >= 0) error = shmget_existing(td, uap, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) error = ENOENT; else error = shmget_allocate_segment(td, uap, mode); } SYSVSHM_UNLOCK(); return (error); } static void shmfork_myhook(struct proc *p1, struct proc *p2) { struct shmmap_state *shmmap_s; size_t size; int i; SYSVSHM_LOCK(); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1) { KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 && IPCID_TO_IX(shmmap_s->shmid) < shmalloced, ("segnum %d shmalloced %d", IPCID_TO_IX(shmmap_s->shmid), shmalloced)); shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; } } SYSVSHM_UNLOCK(); } static void shmexit_myhook(struct vmspace *vm) { struct shmmap_state *base, *shm; int i; base = vm->vm_shm; if (base != NULL) { vm->vm_shm = NULL; SYSVSHM_LOCK(); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } SYSVSHM_UNLOCK(); free(base, M_SHM); } } static void shmrealloc(void) { struct shmid_kernel *newsegs; int i; SYSVSHM_ASSERT_LOCKED(); if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static struct syscall_helper_data shm_syscalls[] = { SYSCALL_INIT_HELPER(shmat), 
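/*
 * Editorial illustration, not part of the original file: the syscall helper
 * table below is what an ordinary userland sequence such as the following
 * ends up invoking.  This is a plain <sys/shm.h> usage sketch, not kernel
 * code, and is kept under #if 0 so it never compiles here.
 */
#if 0
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <string.h>

static int
shm_roundtrip(void)
{
	int shmid;
	char *p;

	/* Create a 4 KiB private segment; kernel path is sys_shmget(). */
	shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
	if (shmid == -1)
		return (-1);
	/* Map it (sys_shmat()), use it, then detach (sys_shmdt()). */
	p = shmat(shmid, NULL, 0);
	if (p == (char *)-1)
		return (-1);
	strcpy(p, "hello");
	shmdt(p);
	/* Mark for removal; the segment is freed once shm_nattch reaches 0. */
	shmctl(shmid, IPC_RMID, NULL);
	return (0);
}
#endif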
SYSCALL_INIT_HELPER(shmctl), SYSCALL_INIT_HELPER(shmdt), SYSCALL_INIT_HELPER(shmget), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl), #endif #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) SYSCALL_INIT_HELPER(shmsys), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data shm32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(shmat), SYSCALL32_INIT_HELPER_COMPAT(shmdt), SYSCALL32_INIT_HELPER_COMPAT(shmget), SYSCALL32_INIT_HELPER(freebsd32_shmsys), SYSCALL32_INIT_HELPER(freebsd32_shmctl), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl), #endif SYSCALL_INIT_LAST }; #endif static int shminit(void) { int i, error; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n"); #endif TUNABLE_ULONG_FETCH("kern.ipc.shmall", &shminfo.shmall); if (!TUNABLE_ULONG_FETCH("kern.ipc.shmmax", &shminfo.shmmax)) { /* Initialize shmmax dealing with possible overflow. */ for (i = PAGE_SIZE; i > 0; i--) { shminfo.shmmax = shminfo.shmall * i; if (shminfo.shmmax >= shminfo.shmall) break; } } TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin); TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni); TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg); TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys); shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; sx_init(&sysvshmsx, "sysvshmsx"); shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; error = syscall_helper_register(shm_syscalls); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(shm32_syscalls); if (error != 0) return (error); #endif return (0); } static int shmunload(void) { int i; if (shm_nused > 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); for (i = 0; i < shmalloced; i++) { #ifdef MAC mac_sysvshm_destroy(&shmsegs[i]); #endif /* * Objects might be still mapped into the processes * address spaces. Actual free would happen on the * last mapping destruction. */ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE) vm_object_deallocate(shmsegs[i].object); } free(shmsegs, M_SHM); shmexit_hook = NULL; shmfork_hook = NULL; sx_destroy(&sysvshmsx); return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { int error; SYSVSHM_LOCK(); error = SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])); SYSVSHM_UNLOCK(); return (error); } #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. 
of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; static int oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); shmseg = shm_find_segment(uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #endif ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); error = copyout(&outbuf, uap->ubuf, sizeof(outbuf)); return (error); #else return (EINVAL); #endif } /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl, (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget, (sy_call_t *)freebsd7_shmctl }; #ifndef _SYS_SYSPROTO_H_ /* XXX actually varargs. */ struct shmsys_args { int which; int a2; int a3; int a4; }; #endif int sys_shmsys(struct thread *td, struct shmsys_args *uap) { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); error = (*shmcalls[uap->which])(td, &uap->a2); return (error); } #endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */ #ifdef COMPAT_FREEBSD32 int freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: { /* shmat */ struct shmat_args ap; ap.shmid = uap->a2; ap.shmaddr = PTRIN(uap->a3); ap.shmflg = uap->a4; return (sysent[SYS_shmat].sy_call(td, &ap)); } case 2: { /* shmdt */ struct shmdt_args ap; ap.shmaddr = PTRIN(uap->a2); return (sysent[SYS_shmdt].sy_call(td, &ap)); } case 3: { /* shmget */ struct shmget_args ap; ap.key = uap->a2; ap.size = uap->a3; ap.shmflg = uap->a4; return (sysent[SYS_shmget].sy_call(td, &ap)); } case 4: { /* shmctl */ struct freebsd7_freebsd32_shmctl_args ap; ap.shmid = uap->a2; ap.cmd = uap->a3; ap.buf = PTRIN(uap->a4); return (freebsd7_freebsd32_shmctl(td, &ap)); } case 1: /* oshmctl */ default: return (EINVAL); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_shmctl(struct thread *td, struct freebsd7_freebsd32_shmctl_args *uap) { int error = 0; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32_old shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, 
sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); u32.shmid_ds32.shm_internal = 0; error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif int freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap) { int error = 0; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32 shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if 
(u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_shmctl_args { int shmid; int cmd; struct shmid_ds_old *buf; }; #endif int freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap) { int error = 0; struct shmid_ds_old old; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not to support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &old, sizeof(old)))) goto done; ipcperm_old2new(&old.shm_perm, &buf.shm_perm); CP(old, buf, shm_segsz); CP(old, buf, shm_lpid); CP(old, buf, shm_cpid); CP(old, buf, shm_nattch); CP(old, buf, shm_atime); CP(old, buf, shm_dtime); CP(old, buf, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: ipcperm_new2old(&buf.shm_perm, &old.shm_perm); if (buf.shm_segsz > INT_MAX) old.shm_segsz = INT_MAX; else CP(buf, old, shm_segsz); CP(buf, old, shm_lpid); CP(buf, old, shm_cpid); if (buf.shm_nattch > SHRT_MAX) old.shm_nattch = SHRT_MAX; else CP(buf, old, shm_nattch); CP(buf, old, shm_atime); CP(buf, old, shm_dtime); CP(buf, old, shm_ctime); old.shm_internal = NULL; error = copyout(&old, uap->buf, sizeof(old)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = shminit(); if (error != 0) shmunload(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: stable/10/sys/pc98/conf/GENERIC =================================================================== --- stable/10/sys/pc98/conf/GENERIC (revision 284664) +++ stable/10/sys/pc98/conf/GENERIC (revision 284665) @@ -1,254 +1,257 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/pc98 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed 
the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu I486_CPU cpu I586_CPU cpu I686_CPU ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols options SCHED_4BSD # 4BSD scheduler #options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # New Network Filesystem Client options NFSD # New Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 options COMPAT_FREEBSD7 # Compatible with FreeBSD7 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options EPSON_BOUNCEDMA # use bounce buffer for 15-16M #options EPSON_MEMWIN # EPSON memory window support #options LINE30 options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options PROCDESC # Support for process descriptors options MAC # TrustedBSD MAC Framework options INCLUDE_CONFIG_FILE # Include this file in kernel +options RACCT # Resource accounting framework +options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default +options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # To make an SMP kernel, the next two lines are needed #options SMP # Symmetric MultiProcessor Kernel #device apic # I/O APIC # Bus support. 
device pci # Floppy drives device fdc # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers options ATA_STATIC_ID # Static device numbering device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices device esp # AMD Am53C974 (Tekram DC-390(T)) device isp # Qlogic family #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device adv # Advansys SCSI adapters device aic # PC-9801-100 device ct # host adapter using WD33C93[ABC] chip device ncv # NCR 53C500 device nsp # Workbit Ninja SCSI-3 device stg # TMC 18C30/18C50 # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Services (SES and SAF-TE) #device ctl # CAM Target Layer # keyboard driver device pckbd # PC98 keyboard device gdc # GDC screen device splash # Splash screen and screen saver support # syscons is the default console driver, resembling an SCO console device sc #device agp # support several AGP chipsets # Power management support (see NOTES for more options) #device apm #device pmc #device canbus #device canbepm # Add suspend/resume support for the i8254. #device pmtimer # PCCARD (PCMCIA) support # PCMCIA and cardbus bridge support device cbb # cardbus (yenta) bridge device pccard # PC Card (16-bit) bus device cardbus # CardBus (32-bit) bus # Serial (COM) ports #options COM_MULTIPORT #options COM_ESP # ESP98 #device sio # 8250, 16[45]50, 8251 based serial ports device uart # Generic UART driver device mse #device joy # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device ppi # Parallel port interface device #device vpo # Requires scbus and da # OLD Parallel port #device olpt device puc # Multi I/O cards and multi-channel UARTs # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') #device em # Intel PRO/1000 Gigabit Ethernet Family device le # AMD Am7900 LANCE and Am79C9xx PCnet #device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! device miibus # MII bus support device bfe # Broadcom BCM440x 10/100 Ethernet #device bge # Broadcom BCM570xx Gigabit Ethernet device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) #device lge # Level 1 LXT1001 gigabit Ethernet #device nge # NatSemi DP83820 gigabit Ethernet device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 #device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') #device vge # VIA VT612x gigabit Ethernet device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # ISA Ethernet NICs. pccard NICs included. 
# 'device ed' requires 'device miibus' device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards device ep # Etherlink III based cards device fe # Fujitsu MB8696x based cards device sn # SMC's 9000 series of Ethernet chips device snc device xe # Xircom pccard Ethernet # Wireless NIC cards #device wlan # 802.11 support #options IEEE80211_DEBUG # enable debug msgs #options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH # enable 802.11s draft support #device wlan_wep # 802.11 WEP support #device wlan_ccmp # 802.11 CCMP support #device wlan_tkip # 802.11 TKIP support #device wlan_amrr # AMRR transmit rate control algorithm #device an # Aironet 4500/4800 802.11 wireless NICs. #device ath # Atheros NICs #device ath_pci # Atheros pci/cardbus glue #device ath_hal # pci/cardbus chip support options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors #device ath_rate_sample # SampleRate tx rate control for ath #device ral # Ralink Technology RT2500 wireless NICs. #device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. #device wl # Older non 802.11 Wavelan wireless NIC. # Pseudo devices. device loop # Network loopback device random # Entropy device device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith # IPv6-to-IPv4 relaying (translation) device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter # USB support #options USB_DEBUG # enable debug msgs #device uhci # UHCI PCI->USB interface #device ohci # OHCI PCI->USB interface #device ehci # EHCI PCI->USB interface (USB 2.0) #device usb # USB Bus (required) #device ukbd # Keyboard #device umass # Disks/Mass storage - Requires scbus and da # Sound support #device sound # Generic sound driver (required) #device snd_mss # Microsoft Sound System #device "snd_sb16" # Sound Blaster 16 #device snd_sbc # Sound Blaster Index: stable/10/sys/powerpc/conf/GENERIC =================================================================== --- stable/10/sys/powerpc/conf/GENERIC (revision 284664) +++ stable/10/sys/powerpc/conf/GENERIC (revision 284665) @@ -1,209 +1,212 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/powerpc # # For more information on this file, please read the handbook section on # Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. 
# # $FreeBSD$ cpu AIM ident GENERIC machine powerpc powerpc makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Platform support options POWERMAC #NewWorld Apple PowerMacs options PSIM #GDB PSIM ppc simulator options MAMBO #IBM Mambo Full System Simulator options PSERIES #PAPR-compliant systems options SCHED_ULE #ULE scheduler options PREEMPTION #Enable kernel thread preemption options INET #InterNETworking options INET6 #IPv6 communications protocols options SCTP #Stream Control Transmission Protocol options FFS #Berkeley Fast Filesystem options SOFTUPDATES #Enable FFS soft updates support options UFS_ACL #Support for access control lists options UFS_DIRHASH #Improve performance on big directories options UFS_GJOURNAL #Enable gjournal-based UFS journaling options QUOTA #Enable disk quotas for UFS options MD_ROOT #MD is a potential root device options NFSCL #New Network Filesystem Client options NFSD #New Network Filesystem Server options NFSLOCKD #Network Lock Manager options NFS_ROOT #NFS usable as root device options MSDOSFS #MSDOS Filesystem options CD9660 #ISO 9660 Filesystem options PROCFS #Process filesystem (requires PSEUDOFS) options PSEUDOFS #Pseudo-filesystem framework options GEOM_PART_APM #Apple Partition Maps. options GEOM_PART_GPT #GUID Partition Tables. options GEOM_LABEL #Provides labelization options COMPAT_FREEBSD4 #Keep this for a while options COMPAT_FREEBSD5 #Compatible with FreeBSD5 options COMPAT_FREEBSD6 #Compatible with FreeBSD6 options COMPAT_FREEBSD7 #Compatible with FreeBSD7 options SCSI_DELAY=5000 #Delay (in ms) before probing SCSI options KTRACE #ktrace(1) syscall trace support options STACK #stack(9) support options SYSVSHM #SYSV-style shared memory options SYSVMSG #SYSV-style message queues options SYSVSEM #SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options PROCDESC # Support for process descriptors options MAC # TrustedBSD MAC Framework options KDTRACE_HOOKS # Kernel DTrace hooks options DDB_CTF # Kernel ELF linker loads CTF data options INCLUDE_CONFIG_FILE # Include this file in kernel +options RACCT # Resource accounting framework +options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default +options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel # CPU frequency control device cpufreq # Standard busses device pci device agp # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices options AHC_ALLOW_MEMIO # Attempt to use memory mapped I/O options AHC_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~128k to driver. 
device isp # Qlogic family device ispfw # Firmware module for Qlogic host adapters device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device sym # NCR/Symbios/LSI Logic 53C8XX/53C1010/53C1510D # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) # syscons is the default console driver, resembling an SCO console device sc device kbdmux options SC_OFWFB # OFW frame buffer options SC_DFLT_FONT # compile font in makeoptions SC_DFLT_FONT=cp437 # Serial (COM) ports device scc device uart device uart_z8530 # FireWire support device firewire # FireWire bus code device sbp # SCSI over FireWire (Requires scbus and da) device fwe # Ethernet over FireWire (non-standard!) # PCI Ethernet NICs that use the common MII bus controller code. device miibus # MII bus support device bge # Broadcom BCM570xx Gigabit Ethernet device bm # Apple BMAC Ethernet device gem # Sun GEM/Sun ERI/Apple GMAC device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) # Pseudo devices. device loop # Network loopback device random # Entropy device device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device ofwd # Open Firmware disks device gif # IPv6 and IPv4 tunneling device faith # IPv6-to-IPv4 relaying/(translation) device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf #Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface device usb # USB Bus (required) device uhid # "Human Interface Devices" device ukbd # Keyboard options KBD_INSTALL_CDEV # install a CDEV entry in /dev device ulpt # Printer device umass # Disks/Mass storage - Requires scbus and da0 device ums # Mouse device atp # Apple USB touchpad device urio # Diamond Rio 500 MP3 player # USB Ethernet device aue # ADMtek USB Ethernet device axe # ASIX Electronics USB Ethernet device cdce # Generic USB over Ethernet device cue # CATC USB Ethernet device kue # Kawasaki LSI USB Ethernet # Wireless NIC cards options IEEE80211_SUPPORT_MESH options AH_SUPPORT_AR5416 # Misc device iicbus # I2C bus code device kiic # Keywest I2C device ad7417 # PowerMac7,2 temperature sensor device adt746x # PowerBook5,8 temperature sensor device ds1631 # PowerMac11,2 temperature sensor device ds1775 # PowerMac7,2 temperature sensor device fcu # Apple Fan Control Unit device max6690 # PowerMac7,2 temperature sensor device powermac_nvram # Open Firmware configuration NVRAM device smu # Apple System Management Unit device windtunnel # Apple G4 MDD fan controller device atibl # ATI-based backlight driver for PowerBooks/iBooks device nvbl # nVidia-based backlight driver for PowerBooks/iBooks # ADB support device adb device cuda device pmu # Sound support device sound # Generic sound driver (required) device snd_ai2s # Apple I2S audio device snd_davbus # Apple DAVBUS audio device snd_uaudio # USB Audio Index: stable/10/sys/sparc64/conf/GENERIC =================================================================== --- stable/10/sys/sparc64/conf/GENERIC (revision 284664) +++ stable/10/sys/sparc64/conf/GENERIC (revision 
284665) @@ -1,236 +1,239 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/sparc64 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu SUN4U ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols # Platforms supported # At this time all platforms are supported, as-is. options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # New Network Filesystem Client options NFSD # New Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL #options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 options COMPAT_FREEBSD7 # Compatible with FreeBSD7 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options PROCDESC # Support for process descriptors options MAC # TrustedBSD MAC Framework options INCLUDE_CONFIG_FILE # Include this file in kernel +options RACCT # Resource accounting framework +options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default +options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. 
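The RACCT/RCTL options being added to each GENERIC above compile the accounting framework in, but RACCT_DEFAULT_TO_DISABLED leaves it inert until the kern.racct.enable tunable is set. A minimal sketch of how an administrator would turn it on at boot, assuming the standard loader.conf mechanism and stock rctl(8) rule syntax; the subject and the 1g limit are example values only, not part of this change:

# /boot/loader.conf -- enable resource accounting despite RACCT_DEFAULT_TO_DISABLED
kern.racct.enable=1

# once enabled, rctl(8) rules take effect, e.g.:
#   rctl -a user:www:swapuse:deny=1g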
# Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel # Standard busses device ebus device isa device pci device sbus device central device fhc # Floppy drives #device fdc # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices options AHC_ALLOW_MEMIO # Attempt to use memory mapped I/O options AHC_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~128k to driver. device esp # AMD Am53C974, Sun ESP and FAS families device isp # Qlogic family device ispfw # Firmware module for Qlogic host adapters device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device mpr # LSI-Logic MPT-Fusion 3 device sym # NCR/Symbios/LSI Logic 53C8XX/53C1010/53C1510D # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Services (SES and SAF-TE) #device ctl # CAM Target Layer # RAID controllers #device amr # AMI MegaRAID #device mlx # Mylex DAC960 family # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc # AT keyboard controller device atkbd # AT keyboard device psm # PS/2 mouse device kbdmux # keyboard multiplexer # syscons is the default console driver, resembling an SCO console device sc device creator # Creator, Creator3D and Elite3D framebuffers device machfb # ATI Mach64 framebuffers device splash # Splash screen and screen saver support options KBD_INSTALL_CDEV # install a CDEV entry in /dev # Builtin hardware device auxio # auxiliary I/O device device eeprom # eeprom (really a front-end for the MK48Txx) device mk48txx # Mostek MK48Txx clocks device rtc # rtc (really a front-end for the MC146818) device mc146818 # Motorola MC146818 and compatible clocks device epic # Sun Fire V215/V245 LEDs device sbbc # Sun BootBus controller (time-of-day clock for # Serengeti and StarCat, console for Serengeti, # requires device uart) # Serial (COM) ports device puc # Multi-channel uarts device scc # Serial communications controllers. device uart # Multi-uart driver # Parallel port #device ppc #device ppbus # Parallel port bus (required) #device lpt # Printer #device ppi # Parallel port interface device #device vpo # Requires scbus and da # PCI Ethernet NICs. #device de # DEC/Intel DC21x4x (``Tulip'') device em # Intel PRO/1000 adapter Gigabit Ethernet Card #device ixgb # Intel PRO/10GbE Ethernet Card device le # AMD Am7900 LANCE and Am79C9xx PCnet device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') #device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
device miibus # MII bus support #device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) device gem # Sun GEM/Sun ERI/Apple GMAC device hme # Sun HME (Happy Meal Ethernet) device nge # NatSemi DP83820 gigabit Ethernet #device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet #device tl # Texas Instruments ThunderLAN #device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II #device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # Wireless NIC cards device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH # enable 802.11s D3.0 support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support device wlan_amrr # AMRR transmit rate control algorithm device ath # Atheros NICs device ath_pci # Atheros pci/cardbus glue device ath_hal # Atheros HAL (Hardware Access Layer) options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors device ath_rate_sample # SampleRate tx rate control for ath # Pseudo devices. device loop # Network loopback device random # Entropy device device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith # IPv6-to-IPv4 relaying (translation) device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) device usb # USB Bus (required) device ukbd # Keyboard device umass # Disks/Mass storage - Requires scbus and da # Sound support device sound # Generic sound driver (required) device snd_audiocs # Crystal Semiconductor CS4231 device snd_es137x # Ensoniq AudioPCI ES137x device snd_t4dwave # Acer Labs M5451 Index: stable/10/sys/sys/racct.h =================================================================== --- stable/10/sys/sys/racct.h (revision 284664) +++ stable/10/sys/sys/racct.h (revision 284665) @@ -1,165 +1,169 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Resource accounting.
 */

#ifndef _RACCT_H_
#define	_RACCT_H_

#include
#include
#include

struct proc;
struct rctl_rule_link;
struct ucred;

/*
 * Resources.
 */
#define	RACCT_UNDEFINED		-1
#define	RACCT_CPU		0
#define	RACCT_DATA		1
#define	RACCT_STACK		2
#define	RACCT_CORE		3
#define	RACCT_RSS		4
#define	RACCT_MEMLOCK		5
#define	RACCT_NPROC		6
#define	RACCT_NOFILE		7
#define	RACCT_VMEM		8
#define	RACCT_NPTS		9
#define	RACCT_SWAP		10
#define	RACCT_NTHR		11
#define	RACCT_MSGQQUEUED	12
#define	RACCT_MSGQSIZE		13
#define	RACCT_NMSGQ		14
#define	RACCT_NSEM		15
#define	RACCT_NSEMOP		16
#define	RACCT_NSHM		17
#define	RACCT_SHMSIZE		18
#define	RACCT_WALLCLOCK		19
#define	RACCT_PCTCPU		20
#define	RACCT_MAX		RACCT_PCTCPU

/*
 * Resource properties.
 */
#define	RACCT_IN_MILLIONS	0x01
#define	RACCT_RECLAIMABLE	0x02
#define	RACCT_INHERITABLE	0x04
#define	RACCT_DENIABLE		0x08
#define	RACCT_SLOPPY		0x10
#define	RACCT_DECAYING		0x20

extern int racct_types[];
+extern int racct_enable;
+
+#define ASSERT_RACCT_ENABLED()	KASSERT(racct_enable, \
+    ("%s called with !racct_enable", __func__))

/*
 * Amount stored in c_resources[] is 10**6 times bigger than what's
 * visible to the userland.  It gets fixed up when retrieving resource
 * usage or adding rules.
 */
#define	RACCT_IS_IN_MILLIONS(X)	(racct_types[X] & RACCT_IN_MILLIONS)

/*
 * Resource usage can drop, as opposed to only grow.  When the process
 * terminates, its resource usage is freed from the respective
 * per-credential racct containers.
 */
#define	RACCT_IS_RECLAIMABLE(X)	(racct_types[X] & RACCT_RECLAIMABLE)

/*
 * Children inherit resource usage.
 */
#define	RACCT_IS_INHERITABLE(X)	(racct_types[X] & RACCT_INHERITABLE)

/*
 * racct_{add,set}(9) can actually return an error and not update resource
 * usage counters.  Note that even when resource is not deniable, allocating
 * resource might cause signals to be sent by RCTL code.
 */
#define	RACCT_IS_DENIABLE(X)	(racct_types[X] & RACCT_DENIABLE)

/*
 * Per-process resource usage information makes no sense, but per-credential
 * one does.  This kind of resource is usually allocated for a process, but
 * freed using credentials.
 */
#define	RACCT_IS_SLOPPY(X)	(racct_types[X] & RACCT_SLOPPY)

/*
 * When a process terminates, its resource usage is not automatically
 * subtracted from per-credential racct containers.  Instead, the resource
 * usage of per-credential racct containers decays in time.
 * Resource usage can also drop for such a resource.
 * So far, the only such resource is RACCT_PCTCPU.
 */
#define	RACCT_IS_DECAYING(X)	(racct_types[X] & RACCT_DECAYING)

/*
 * Resource usage can drop, as opposed to only grow.
 */
#define	RACCT_CAN_DROP(X)	(RACCT_IS_RECLAIMABLE(X) | RACCT_IS_DECAYING(X))

/*
 * The 'racct' structure defines resource consumption for a particular
 * subject, such as process or jail.
* * This structure must be filled with zeroes initially. */ struct racct { int64_t r_resources[RACCT_MAX + 1]; LIST_HEAD(, rctl_rule_link) r_rule_links; }; int racct_add(struct proc *p, int resource, uint64_t amount); void racct_add_cred(struct ucred *cred, int resource, uint64_t amount); void racct_add_force(struct proc *p, int resource, uint64_t amount); int racct_set(struct proc *p, int resource, uint64_t amount); void racct_set_force(struct proc *p, int resource, uint64_t amount); void racct_sub(struct proc *p, int resource, uint64_t amount); void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount); uint64_t racct_get_limit(struct proc *p, int resource); uint64_t racct_get_available(struct proc *p, int resource); void racct_create(struct racct **racctp); void racct_destroy(struct racct **racctp); int racct_proc_fork(struct proc *parent, struct proc *child); void racct_proc_fork_done(struct proc *child); void racct_proc_exit(struct proc *p); void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred); void racct_move(struct racct *dest, struct racct *src); #endif /* !_RACCT_H_ */ Index: stable/10/sys/vm/swap_pager.c =================================================================== --- stable/10/sys/vm/swap_pager.c (revision 284664) +++ stable/10/sys/vm/swap_pager.c (revision 284665) @@ -1,2803 +1,2805 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. 
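The racct.h changes above export a single global switch, racct_enable, together with an ASSERT_RACCT_ENABLED() guard for code that must only run with accounting active. The hunks that follow (the swap_pager.c one below is typical) apply the same idiom: callers outside the racct core test the switch before taking locks and charging, while helpers internal to the accounting code assert it. A minimal sketch of that division of labor, assuming hypothetical foo_charge() and foo_decay() consumers that are not part of this patch:

/* Illustrative only; foo_charge() and foo_decay() do not exist in the tree. */
static int
foo_charge(struct proc *p, uint64_t amount)
{
	int error;

	if (!racct_enable)
		return (0);	/* framework compiled in but switched off */

	PROC_LOCK(p);
	error = racct_add(p, RACCT_SWAP, amount);	/* may be denied by an rctl rule */
	PROC_UNLOCK(p);
	return (error);
}

/* Helpers inside the accounting core document the invariant instead. */
static void
foo_decay(struct racct *racct)
{

	ASSERT_RACCT_ENABLED();
	/* ... adjust racct->r_resources[RACCT_PCTCPU] ... */
}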
This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16 * or 32 pages per allocation. * The 32-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * The swblock structure maps an object and a small, fixed-size range * of page indices to disk addresses within a swap area. * The collection of these mappings is implemented as a hash table. * Unused disk addresses within a swap area are allocated and managed * using a blist. */ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static int swdev_syscall_active = 0; /* serialize swap(on|off) */ static vm_ooffset_t swap_total; SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, "Total amount of available swap storage."); static vm_ooffset_t swap_reserved; SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, "Amount of swap storage needed to back all allocated anonymous memory."); static int overcommit = 0; SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. 
See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { vm_ooffset_t r, s; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; if (incr & PAGE_MASK) panic("swap_reserve: & PAGE_MASK"); #ifdef RACCT - PROC_LOCK(curproc); - error = racct_add(curproc, RACCT_SWAP, incr); - PROC_UNLOCK(curproc); - if (error != 0) - return (0); + if (racct_enable) { + PROC_LOCK(curproc); + error = racct_add(curproc, RACCT_SWAP, incr); + PROC_UNLOCK(curproc); + if (error != 0) + return (0); + } #endif res = 0; mtx_lock(&sw_dev_mtx); r = swap_reserved + incr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count; s *= PAGE_SIZE; } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; swap_reserved = r; } mtx_unlock(&sw_dev_mtx); if (res) { PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) res = 0; else uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); PROC_UNLOCK(curproc); if (!res) { mtx_lock(&sw_dev_mtx); swap_reserved -= incr; mtx_unlock(&sw_dev_mtx); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (!res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; mtx_lock(&sw_dev_mtx); swap_reserved += incr; mtx_unlock(&sw_dev_mtx); #ifdef RACCT PROC_LOCK(curproc); racct_add_force(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); #endif uip = curthread->td_ucred->cr_ruidinfo; PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curthread->td_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { struct uidinfo *uip; uip = cred->cr_ruidinfo; if (decr & PAGE_MASK) panic("swap_release: & PAGE_MASK"); mtx_lock(&sw_dev_mtx); if (swap_reserved < decr) panic("swap_reserved < decr"); swap_reserved -= decr; mtx_unlock(&sw_dev_mtx); UIDINFO_VMSIZE_LOCK(uip); if (uip->ui_vmsize < decr) printf("negative vmsize for uid = %d\n", uip->ui_uid); uip->ui_vmsize -= decr; UIDINFO_VMSIZE_UNLOCK(uip); racct_sub_cred(cred, RACCT_SWAP, decr); } static void swapdev_strategy(struct buf *, struct swdevt *sw); #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int 
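The swap_reserve_by_cred() hunk above reads more easily once the vm.overcommit bits defined earlier in the file are spelled out: SWAP_RESERVE_FORCE_ON (0x1) makes reservations actually enforced, SWAP_RESERVE_RLIMIT_ON (0x2) additionally applies the per-uid RLIMIT_SWAP check, and SWAP_RESERVE_ALLOW_NONWIRED (0x4) lets non-wired physical memory count toward the allowance. A condensed restatement of the policy test, for illustration only; it omits the PRIV_VM_SWAP_NOQUOTA escape hatch and is not a drop-in replacement:

/*
 * Simplified restatement of the check in swap_reserve_by_cred().  "r" is
 * swap_reserved + incr; "s_mem" is the non-wired page allowance already
 * scaled by PAGE_SIZE.
 */
static int
overcommit_allows(vm_ooffset_t r, vm_ooffset_t s_mem)
{
	vm_ooffset_t s;

	s = swap_total;
	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED)
		s += s_mem;			/* unwired RAM may back the request */
	if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0)
		return (1);			/* accounting not enforced: always allow */
	return (r <= s);			/* enforced: stay within swap (+ RAM) */
}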
nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static int swap_async_max = 4; /* maximum in-progress async I/O's */ static struct sx sw_alloc_sx; SYSCTL_INT(_vm, OID_AUTO, swap_async_max, CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct mtx sw_alloc_mtx; /* protect list manipulation */ static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. 
*/ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct thread *, struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); static void swp_pager_free_nrpage(vm_page_t m) { vm_page_lock(m); if (m->wire_count == 0) vm_page_free(m); vm_page_unlock(m); } /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. 
Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 32 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ n = cnt.v_page_count / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_reserve_kva(swap_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %lu to %lu.\n", n2, n); swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblock); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object * and then converting it with swp_pager_meta_build(). * * This routine may block in vm_object_allocate() and create a named * object lookup race, so we must interlock. * * MPSAFE */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset + PAGE_MASK + size); if (handle) { mtx_lock(&Giant); /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. 
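The sizing policy described in the swap_pager_swap_init() comments boils down to: reserve roughly one swblock per two physical pages (each swblock maps up to 32 pages), cap that by maxswzone, and size the collision hash at the smallest power of two that is at least one eighth of the entry count. A worked illustration under those assumptions; swhash_buckets() is hypothetical and not part of the file:

/*
 * Hypothetical helper reproducing the bucket-count loop from
 * swap_pager_swap_init().  With 1 GiB of 4 KiB pages, v_page_count is
 * 262144, n starts at 131072 swblock entries, and the result is 16384
 * buckets (so swhash_mask would be 0x3fff).
 */
static unsigned long
swhash_buckets(unsigned long nswblocks)
{
	unsigned long n;

	for (n = 1; n < nswblocks / 8; n *= 2)
		continue;		/* same doubling loop as above */
	return (n);			/* the lookup mask is n - 1 */
}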
*/ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) { sx_xunlock(&sw_alloc_sx); mtx_unlock(&Giant); return (NULL); } crhold(cred); } object = vm_object_allocate(OBJT_DEFAULT, pindex); VM_OBJECT_WLOCK(object); object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_WUNLOCK(object); } sx_xunlock(&sw_alloc_sx); mtx_unlock(&Giant); } else { if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } object = vm_object_allocate(OBJT_DEFAULT, pindex); VM_OBJECT_WLOCK(object); if (cred != NULL) { object->cred = cred; object->charge = size; } swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_WUNLOCK(object); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); mtx_unlock(&sw_alloc_mtx); } VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. 
*/ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_kvaalloc = bp->b_data; bp->b_data = unmapped_buf; bp->b_kvabase = unmapped_buf; bp->b_offset = 0; bp->b_flags |= B_UNMAPPED; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_WLOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_WUNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_WUNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. 
* * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource) { if (srcobject->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE( NOBJLIST(srcobject->handle), srcobject, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidently * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page within a reasonable * distance. We do not try to restrict it to the swap device stripe * (that is handled in getpages/putpages). It probably isn't worth * doing here. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk0; VM_OBJECT_ASSERT_LOCKED(object); /* * do we have good backing store at the requested index ? 
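 * (swp_pager_meta_ctl() with flags == 0 is a pure lookup; it returns
 * SWAPBLK_NONE when no swap block is assigned to this index.)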
*/ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = (i - 1); } /* * find forward-looking contiguous good backing store */ if (after != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = (i - 1); } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * SWAP_PAGER_GETPAGES() - bring pages in from swap * * Attempt to retrieve (m, count) pages from backing store, but make * sure we retrieve at least m[reqpage]. We try to load in as large * a chunk surrounding m[reqpage] as is contiguous in swap and which * belongs to the same object. * * The code is designed for asynchronous operation and * immediate-notification of 'reqpage' but tends not to be * used that way. Please do not optimize-out this algorithmic * feature, I intend to improve on it in the future. * * The parent has a single vm_object_pip_add() reference prior to * calling us and we should return with the same. * * The parent has BUSY'd the pages. We should return with 'm' * left busy, but the others adjusted. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) { struct buf *bp; vm_page_t mreq; int i; int j; daddr_t blk; mreq = m[reqpage]; KASSERT(mreq->object == object, ("swap_pager_getpages: object mismatch %p/%p", object, mreq->object)); /* * Calculate range to retrieve. The pages have already been assigned * their swapblks. We require a *contiguous* range but we know it to * not span devices. If we do not supply it, bad things * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the * loops are set up such that the case(s) are handled implicitly. * * The swp_*() calls must be made with the object locked. */ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); for (i = reqpage - 1; i >= 0; --i) { daddr_t iblk; iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); if (blk != iblk + (reqpage - i)) break; } ++i; for (j = reqpage + 1; j < count; ++j) { daddr_t jblk; jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); if (blk != jblk - (j - reqpage)) break; } /* * free pages outside our collection range. Note: we never free * mreq, it must remain busy throughout. */ if (0 < i || j < count) { int k; for (k = 0; k < i; ++k) swp_pager_free_nrpage(m[k]); for (k = j; k < count; ++k) swp_pager_free_nrpage(m[k]); } /* * Return VM_PAGER_FAIL if we have nothing to do. Return mreq * still busy, but the others unbusied. 
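 *
 * blk is SWAPBLK_NONE only when mreq itself has no swap assigned; the
 * backward and forward scans above stop at the first discontinuity, so
 * past this check the whole [i, j) range is known to be contiguously
 * backed by swap.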
*/ if (blk == SWAPBLK_NONE) return (VM_PAGER_FAIL); /* * Getpbuf() can sleep. */ VM_OBJECT_WUNLOCK(object); /* * Get a swap buffer header to perform the IO */ bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk - (reqpage - i); bp->b_bcount = PAGE_SIZE * (j - i); bp->b_bufsize = PAGE_SIZE * (j - i); bp->b_pager.pg_reqpage = reqpage - i; VM_OBJECT_WLOCK(object); { int k; for (k = i; k < j; ++k) { bp->b_pages[k - i] = m[k]; m[k]->oflags |= VPO_SWAPINPROG; } } bp->b_npages = j - i; PCPU_INC(cnt.v_swapin); PCPU_ADD(cnt.v_swappgsin, bp->b_npages); /* * We still hold the lock on mreq, and our automatic completion routine * does not remove it. */ vm_object_pip_add(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * wait for the page we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the meta-data. */ VM_OBJECT_WLOCK(object); while ((mreq->oflags & VPO_SWAPINPROG) != 0) { mreq->oflags |= VPO_SWAPSLEEP; PCPU_INC(cnt.v_intrans); if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * mreq is left busied after completion, but all the other pages * are freed. If we had an unrecoverable read error the page will * not be valid. */ if (mreq->valid != VM_PAGE_BITS_ALL) { return (VM_PAGER_ERROR); } else { return (VM_PAGER_OK); } /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. 
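 *
 * In practice every page that is assigned swap ends up reported as
 * VM_PAGER_PEND: the asynchronous path marks it so before issuing the
 * I/O, and the synchronous path does the same after bwait() so that
 * swp_pager_async_iodone() can do the cleanup in both cases.  Only
 * pages for which no swap space could be allocated come back as
 * VM_PAGER_FAIL.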
*/ void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int i, n; boolean_t sync; if (count && m[0]->object != object) { panic("swap_pager_putpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_WUNLOCK(object); n = 0; if (curproc != pageproc) sync = TRUE; else sync = (flags & VM_PAGER_PUT_SYNC) != 0; /* * Step 2 * * Update nsw parameters from swap_async_max sysctl values. * Do not let the sysop crash the machine with bogus numbers. */ mtx_lock(&pbuf_mtx); if (swap_async_max != nsw_wcount_async_max) { int n; /* * limit range */ if ((n = swap_async_max) > nswbuf / 2) n = nswbuf / 2; if (n < 1) n = 1; swap_async_max = n; /* * Adjust difference ( if possible ). If the current async * count is too low, we may not be able to make the adjustment * at this time. */ n -= nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } } mtx_unlock(&pbuf_mtx); /* * Step 3 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); rtvals[i+j] = VM_PAGER_OK; mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; PCPU_INC(cnt.v_swapout); PCPU_ADD(cnt.v_swappgsout, bp->b_npages); /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* restart outter loop */ continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete, then update rtvals. * We just set the rtvals[] to VM_PAGER_PEND so we can call * our async completion routine at the end, thus avoiding a * double-free. 
*/ bwait(bp, PVM, "swwrt"); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if ((bp->b_flags & B_UNMAPPED) != 0) { bp->b_data = bp->b_kvaalloc; bp->b_kvabase = bp->b_kvaalloc; bp->b_flags &= ~B_UNMAPPED; } else pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->paging_in_progress); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * When reading, reqpage needs to stay * locked for the parent, but all other * pages can be freed. We still want to * wakeup the parent waiting on the page, * though. ( also: pg_reqpage can be -1 and * not match anything ). * * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * someone may be waiting for that. * * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; if (i != bp->b_pager.pg_reqpage) swp_pager_free_nrpage(m); else { vm_page_lock(m); vm_page_flash(m); vm_page_unlock(m); } /* * If i == bp->b_pager.pg_reqpage, do not wake * the page up. The caller needs to. */ } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. * * If not the requested page then deactivate it. * * Note that the requested page, reqpage, is left * busied, but we still have to wake it up. The * other pages are released (unbusied) by * vm_page_xunbusy(). 
*/ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); /* * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * could be waiting for it in getpages. However, * be sure to not unbusy getpages specifically * requested page - getpages expects it to be * left busy. */ if (i != bp->b_pager.pg_reqpage) { vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_lock(m); vm_page_flash(m); vm_page_unlock(m); } } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_sunbusy(m); if (vm_page_count_severe()) { vm_page_lock(m); vm_page_try_to_cache(m); vm_page_unlock(m); } } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not sleep. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; } mtx_unlock(&swhash_mtx); return (0); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * * This routine dissociates the page at the given index within a * swap block from its backing store, paging it in if necessary. * If the page is paged in, it is placed in the inactive queue, * since it had its backing store ripped out from under it. * We also attempt to swap in all other pages in the swap block, * we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. 
*/ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_subtract(object, 1); vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_subtract(object, 1); vm_page_dirty(m); vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; int i, j, retries; GIANT_REQUIRED; retries = 0; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { vm_object_t object = swap->swb_object; vm_pindex_t pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (swp_pager_isondev(swap->swb_pages[j], sp)) { /* avoid deadlock */ if (!VM_OBJECT_TRYWLOCK(object)) { break; } else { mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); VM_OBJECT_WUNLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } } } } mtx_unlock(&swhash_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. */ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int exhausted; struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_INSERT_TAIL( NOBJLIST(object->handle), object, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. 
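 *
 * The retry is needed because the zone allocation below uses M_NOWAIT;
 * on failure we drop the object and hash locks, let the pagedaemon or
 * VM_WAIT make progress, and start over, since another thread may have
 * inserted the swblock while we slept.  The hash bucket is found with
 * swp_pager_hash(object, pindex) and the slot within a swblock is
 * pindex & SWAP_META_MASK.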
*/ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swap_zone)) { if (atomic_cmpset_int(&exhausted, 0, 1)) printf("swap zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonex", 10); } else VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } if (atomic_cmpset_int(&exhausted, 1, 0)) printf("swap zone ok\n"); swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP) return; while (count > 0) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; if (v != SWAPBLK_NONE) { swp_pager_freeswapspace(v, 1); swap->swb_pages[index & SWAP_META_MASK] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } --count; ++index; } else { int n = SWAP_META_PAGES - (index & SWAP_META_MASK); count -= n; index += n; } mtx_unlock(&swhash_mtx); } } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { daddr_t index = 0; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; while (object->un_pager.swp.swp_bcount) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { int i; for (i = 0; i < SWAP_META_PAGES; ++i) { daddr_t v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic("swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. 
This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); swdev_syscall_active = 1; /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(td, vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message and return -1; otherwise, return 0. 
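 *
 * For example (assuming 4KB pages), a swap zone capped at 2^20 swblock
 * entries can describe 2^20 * SWAP_META_PAGES = 2^25 pages, i.e. 128GB
 * of swap, so the warning would fire once more than 64GB of swap was
 * configured.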
*/ static int swapon_check_swzone(unsigned long npages) { unsigned long maxpages; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); return (-1); } return (0); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; swapon_check_swzone(swap_total / PAGE_SIZE); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. 
*/ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks, dvbase; #ifdef MAC int error; #endif mtx_assert(&Giant, MA_OWNED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail < nblks + nswap_lowat) { return (ENOMEM); } /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. 
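 *
 * swap_pager_swapoff() walks the swap metadata hash and forces every
 * page backed by this device back into memory; because objects may be
 * locked or still paging, it can take several passes before
 * sp->sw_used drops to zero.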
*/ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); sp->sw_id = NULL; mtx_lock(&sw_dev_mtx); TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap usage in pages for a * vmspace. * * The map must be locked. * * Swap usage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; long count, n; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_WUNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. 
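 *
 * The pager attaches a consumer of the private "SWAP" GEOM class to
 * each disk swap device.  swapgeom_strategy() translates the pager's
 * struct buf into a struct bio and hands it to g_io_request();
 * swapgeom_done() copies the result back and completes the buf with
 * bufdone().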
* */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; int destroy; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); mtx_lock(&sw_dev_mtx); destroy = ((--cp->index) == 0 && cp->private); if (destroy) { sp = bp2->bio_caller1; sp->sw_id = NULL; } mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); if (destroy) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } cp->index++; mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if ((bp->b_flags & B_UNMAPPED) != 0) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } cp->private = (void *)(uintptr_t)1; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* XXX: direct call when Giant untangled */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } struct swh0h0 { struct cdev *dev; struct vnode *vp; int error; }; static void swapongeom_ev(void *arg, int flags) { struct swh0h0 *swh; struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; swh = arg; swh->error = 0; pp = g_dev_getprovider(swh->dev); if (pp == NULL) { swh->error = ENODEV; return; } mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); swh->error = EBUSY; return; } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 0; /* Number of active I/Os. 
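 * swapgeom_done() decrements this count and swapgeom_orphan() consults
 * it, so the consumer is only destroyed once the last in-flight request
 * has completed.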
*/ cp->private = NULL; /* Orphanization flag */ g_attach(cp, pp); /* * XXX: Everytime you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); swh->error = error; return; } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(swh->dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); swh->error = 0; } static int swapongeom(struct thread *td, struct vnode *vp) { int error; struct swh0h0 swh; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); swh.dev = vp->v_rdev; swh.vp = vp; swh.error = 0; /* XXX: direct call when Giant untangled */ error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); if (!error) error = swh.error; VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } Index: stable/10/sys/vm/vm_map.c =================================================================== --- stable/10/sys/vm/vm_map.c (revision 284664) +++ stable/10/sys/vm/vm_map.c (revision 284665) @@ -1,4297 +1,4306 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. 
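 * A subsequent write through either mapping then takes the normal
 * copy-on-write fault path, giving that map its own private copy of
 * the page that was written.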
*/ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static int vm_map_zinit(void *mem, int ize, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. 
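 * (Kernel map entries therefore come from kmapentzone, created below
 * as a UMA_ZONE_VM zone and allocated with M_NOWAIT, so creating a map
 * entry never recurses back into the kernel map.)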
*/ void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm = (struct vmspace *)mem; vm->vm_map.pmap = NULL; (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); PMAP_LOCK_INIT(vmspace_pmap(vm)); return (0); } static int vm_map_zinit(void *mem, int size, int flags) { vm_map_t map; map = (vm_map_t)mem; memset(map, 0, sizeof(*map)); mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit(). */ struct vmspace * vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); if (pinit == NULL) pinit = &pmap_pinit; if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } +#ifdef RACCT static void vmspace_container_reset(struct proc *p) { -#ifdef RACCT PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); racct_set(p, RACCT_RSS, 0); racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); -#endif } +#endif static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. 
*/ (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmspace_free() called with non-sleepable lock held"); if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); do { refcnt = vm->vm_refcnt; if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } - vmspace_container_reset(p); +#ifdef RACCT + if (racct_enable) + vmspace_container_reset(p); +#endif } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } do { refcnt = vm->vm_refcnt; if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_xlock_(&map->lock, file, line); map->timestamp++; } static void vm_map_process_deferred(void) { struct thread *td; vm_map_entry_t entry, next; vm_object_t object; td = curthread; entry = td->td_map_def_user; td->td_map_def_user = NULL; while (entry != NULL) { next = entry->next; if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) { /* * Decrement the object's writemappings and * possibly the vnode's v_writecount. 
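 * Like the rest of the deferred entry teardown, this runs only after
 * the map lock has been dropped, because releasing object and vnode
 * references may sleep.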
*/ KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with writecount")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for writecount")); vnode_pager_release_writecount(object, entry->start, entry->end); } vm_map_entry_deallocate(entry, FALSE); entry = next; } } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. */ int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { unsigned int last_timestamp; if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { if (!sx_try_upgrade_(&map->lock, file, line)) { last_timestamp = map->timestamp; sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); /* * If the map's timestamp does not change while the * map is unlocked, then the upgrade succeeds. */ sx_xlock_(&map->lock, file, line); if (last_timestamp != map->timestamp) { sx_xunlock_(&map->lock, file, line); return (1); } } } map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else sx_downgrade_(&map->lock, file, line); } /* * vm_map_locked: * * Returns a non-zero value if the caller holds a write (exclusive) lock * on the specified map and the value "0" otherwise. */ int vm_map_locked(vm_map_t map) { if (map->system_map) return (mtx_owned(&map->system_mtx)); else return (sx_xlocked(&map->lock)); } #ifdef INVARIANTS static void _vm_map_assert_locked(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_assert_(&map->system_mtx, MA_OWNED, file, line); else sx_assert_(&map->lock, SA_XLOCKED, file, line); } #define VM_MAP_ASSERT_LOCKED(map) \ _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE) #else #define VM_MAP_ASSERT_LOCKED(map) #endif /* * _vm_map_unlock_and_wait: * * Atomically releases the lock on the specified map and puts the calling * thread to sleep. The calling thread will remain asleep until either * vm_map_wakeup() is performed on the map or the specified timeout is * exceeded. * * WARNING! This function does not perform deferred deallocations of * objects and map entries. 
Therefore, the calling thread is expected to * reacquire the map lock after reawakening and later perform an ordinary * unlock operation, such as vm_map_unlock(), before completing its * operation on the map. */ int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) { mtx_lock(&map_sleep_mtx); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else sx_xunlock_(&map->lock, file, line); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo)); } /* * vm_map_wakeup: * * Awaken any threads that have slept on the map using * vm_map_unlock_and_wait(). */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the map unlock * and the msleep() in _vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } void vm_map_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); map->busy++; } void vm_map_unbusy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); KASSERT(map->busy, ("vm_map_unbusy: not busy")); if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); wakeup(&map->busy); } } void vm_map_wait_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); while (map->busy) { vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); if (map->system_map) msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); else sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); } map->timestamp++; } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, pmap, min, max); return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. */ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; map->min_offset = min; map->max_offset = max; map->flags = 0; map->root = NULL; map->timestamp = 0; map->busy = 0; } void vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, pmap, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. 
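 *
 * Only the MAP_ENTRY_BEHAV_MASK bits of eflags are rewritten; all other
 * entry flags are preserved.  This is the entry-level backend for the
 * MADV_NORMAL, MADV_RANDOM and MADV_SEQUENTIAL cases of
 * vm_map_madvise() below.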
*/ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_set_max_free: * * Set the max_free field in a vm_map_entry. */ static inline void vm_map_entry_set_max_free(vm_map_entry_t entry) { entry->max_free = entry->adj_free; if (entry->left != NULL && entry->left->max_free > entry->max_free) entry->max_free = entry->left->max_free; if (entry->right != NULL && entry->right->max_free > entry->max_free) entry->max_free = entry->right->max_free; } /* * vm_map_entry_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower or higher) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root) { vm_map_entry_t llist, rlist; vm_map_entry_t ltree, rtree; vm_map_entry_t y; /* Special case of empty tree. */ if (root == NULL) return (root); /* * Pass One: Splay down the tree until we find addr or a NULL * pointer where addr would go. llist and rlist are the two * sides in reverse order (bottom-up), with llist linked by * the right pointer and rlist linked by the left pointer in * the vm_map_entry. Wait until Pass Two to set max_free on * the two spines. */ llist = NULL; rlist = NULL; for (;;) { /* root is never NULL in here. */ if (addr < root->start) { y = root->left; if (y == NULL) break; if (addr < y->start && y->left != NULL) { /* Rotate right and put y on rlist. */ root->left = y->right; y->right = root; vm_map_entry_set_max_free(root); root = y->left; y->left = rlist; rlist = y; } else { /* Put root on rlist. */ root->left = rlist; rlist = root; root = y; } } else if (addr >= root->end) { y = root->right; if (y == NULL) break; if (addr >= y->end && y->right != NULL) { /* Rotate left and put y on llist. */ root->right = y->left; y->left = root; vm_map_entry_set_max_free(root); root = y->right; y->right = llist; llist = y; } else { /* Put root on llist. */ root->right = llist; llist = root; root = y; } } else break; } /* * Pass Two: Walk back up the two spines, flip the pointers * and set max_free. The subtrees of the root go at the * bottom of llist and rlist. */ ltree = root->left; while (llist != NULL) { y = llist->right; llist->right = ltree; vm_map_entry_set_max_free(llist); ltree = llist; llist = y; } rtree = root->right; while (rlist != NULL) { y = rlist->left; rlist->left = rtree; vm_map_entry_set_max_free(rlist); rtree = rlist; rlist = y; } /* * Final assembly: add ltree and rtree as subtrees of root. */ root->left = ltree; root->right = rtree; vm_map_entry_set_max_free(root); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. 
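 *
 * Linking updates three structures at once: the doubly-linked entry
 * list rooted at &map->header, the splay tree (the new entry becomes
 * the root), and the adj_free/max_free hints used by
 * vm_map_findspace().  For example, an entry ending at address E that
 * is inserted in front of a neighbor starting at address N ends up
 * with adj_free == N - E, or map->max_offset - E when the neighbor is
 * the header.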
*/ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { CTR4(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); VM_MAP_ASSERT_LOCKED(map); KASSERT(after_where == &map->header || after_where->end <= entry->start, ("vm_map_entry_link: prev end %jx new start %jx overlap", (uintmax_t)after_where->end, (uintmax_t)entry->start)); KASSERT(after_where->next == &map->header || entry->end <= after_where->next->start, ("vm_map_entry_link: new end %jx next start %jx overlap", (uintmax_t)entry->end, (uintmax_t)after_where->next->start)); map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; if (after_where != &map->header) { if (after_where != map->root) vm_map_entry_splay(after_where->start, map->root); entry->right = after_where->right; entry->left = after_where; after_where->right = NULL; after_where->adj_free = entry->start - after_where->end; vm_map_entry_set_max_free(after_where); } else { entry->right = map->root; entry->left = NULL; } entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); map->root = entry; } static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev, root; VM_MAP_ASSERT_LOCKED(map); if (entry != map->root) vm_map_entry_splay(entry->start, map->root); if (entry->left == NULL) root = entry->right; else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; root->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - root->end; vm_map_entry_set_max_free(root); } map->root = root; prev = entry->prev; next = entry->next; next->prev = prev; prev->next = next; map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize_free: * * Recompute the amount of free space following a vm_map_entry * and propagate that value up the tree. Call this function after * resizing a map entry in-place, that is, without a call to * vm_map_entry_link() or _unlink(). * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry) { /* * Using splay trees without parent pointers, propagating * max_free up the tree is done by moving the entry to the * root and making the change there. */ if (entry != map->root) map->root = vm_map_entry_splay(entry->start, map->root); entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur; boolean_t locked; /* * If the map is empty, then the map entry immediately preceding * "address" is the map's header. */ cur = map->root; if (cur == NULL) *entry = &map->header; else if (address >= cur->start && cur->end > address) { *entry = cur; return (TRUE); } else if ((locked = vm_map_locked(map)) || sx_try_upgrade(&map->lock)) { /* * Splay requires a write lock on the map. 
However, it only * restructures the binary search tree; it does not otherwise * change the map. Thus, the map's timestamp need not change * on a temporary upgrade. */ map->root = cur = vm_map_entry_splay(address, cur); if (!locked) sx_downgrade(&map->lock); /* * If "address" is contained within a map entry, the new root * is that map entry. Otherwise, the new root is a map entry * immediately before or after "address". */ if (address >= cur->start) { *entry = cur; if (cur->end > address) return (TRUE); } else *entry = cur->prev; } else /* * Since the map is only locked for read access, perform a * standard binary search tree lookup for "address". */ for (;;) { if (address < cur->start) { if (cur->left == NULL) { *entry = cur->prev; break; } cur = cur->left; } else if (cur->end > address) { *entry = cur; return (TRUE); } else { if (cur->right == NULL) { *entry = cur; break; } cur = cur->right; } } return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; struct ucred *cred; vm_inherit_t inheritance; boolean_t charge_prev_obj; VM_MAP_ASSERT_LOCKED(map); /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); protoeflags = 0; charge_prev_obj = FALSE; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) { protoeflags |= MAP_ENTRY_NOFAULT; KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (cow & MAP_VN_WRITECOUNT) protoeflags |= MAP_ENTRY_VN_WRITECNT; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; cred = NULL; KASSERT((object != kmem_object && object != kernel_object) || ((object == kmem_object || object == kernel_object) && !(protoeflags & MAP_ENTRY_NEEDS_COPY)), ("kmem or kernel object and cow")); if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT)) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) return (KERN_RESOURCE_SHORTAGE); KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) || object->cred == NULL, ("OVERCOMMIT: vm_map_insert o %p", object)); cred = curthread->td_ucred; crhold(cred); if (object == NULL && !(protoeflags & MAP_ENTRY_NEEDS_COPY)) charge_prev_obj = TRUE; } charged: /* Expand the kernel pmap, if necessary. 
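 *
 * Insertions into the kernel map above kernel_vm_end need page-table
 * pages that do not yet exist, so the kernel page tables are grown up
 * to "end" before the new entry becomes visible.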
*/ if (map == kernel_map && end > kernel_vm_end) pmap_growkernel(end); if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_WLOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); } else if ((prev_entry != &map->header) && (prev_entry->eflags == protoeflags) && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 && (prev_entry->end == start) && (prev_entry->wired_count == 0) && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && (prev_entry->object.vm_object->cred == cred))) && vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end), charge_prev_obj)) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if ((prev_entry->inheritance == inheritance) && (prev_entry->protection == prot) && (prev_entry->max_protection == max)) { map->size += (end - prev_entry->end); prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); if (cred != NULL) crfree(cred); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); if (cred != NULL && object != NULL && object->cred != NULL && !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { /* Object already accounts for this uid. */ crfree(cred); cred = NULL; } } /* * NOTE: if conditionals fail, object can be NULL here. This occurs * in things like the buffer map where we manage kva but do not manage * backing objects. */ /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->cred = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = inheritance; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; new_entry->next_read = OFF_TO_IDX(offset); KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * It may be possible to merge the new entry with the next and/or * previous entries. However, due to MAP_STACK_* being a hack, a * panic can result from merging such entries. */ if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0) vm_map_simplify_entry(map, new_entry); if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. 
* * In a vm_map_entry, "adj_free" is the amount of free space * adjacent (higher address) to this entry, and "max_free" is the * maximum amount of contiguous free space in its subtree. This * allows finding a free region in one path down the tree, so * O(log n) amortized with splay trees. * * The map must be locked, and leaves it so. * * Returns: 0 on success, and starting address in *addr, * 1 if insufficient space. */ int vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, vm_offset_t *addr) /* OUT */ { vm_map_entry_t entry; vm_offset_t st; /* * Request must fit within min/max VM address and must avoid * address wrap. */ if (start < map->min_offset) start = map->min_offset; if (start + length > map->max_offset || start + length < start) return (1); /* Empty tree means wide open address space. */ if (map->root == NULL) { *addr = start; return (0); } /* * After splay, if start comes before root node, then there * must be a gap from start to the root. */ map->root = vm_map_entry_splay(start, map->root); if (start + length <= map->root->start) { *addr = start; return (0); } /* * Root is the last node that might begin its gap before * start, and this is the last comparison where address * wrap might be a problem. */ st = (start > map->root->end) ? start : map->root->end; if (length <= map->root->end + map->root->adj_free - st) { *addr = st; return (0); } /* With max_free, can immediately tell if no solution. */ entry = map->root->right; if (entry == NULL || length > entry->max_free) return (1); /* * Search the right subtree in the order: left subtree, root, * right subtree (first fit). The previous splay implies that * all regions in the right subtree have addresses > start. */ while (entry != NULL) { if (entry->left != NULL && entry->left->max_free >= length) entry = entry->left; else if (entry->adj_free >= length) { *addr = entry->end; return (0); } else entry = entry->right; } /* Can't get here, so panic if we do. */ panic("vm_map_findspace: max_free corrupt"); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t end; int result; end = start + length; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if ((cow & MAP_CHECK_EXCL) == 0) vm_map_delete(map, start, end); if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } vm_map_unlock(map); return (result); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. 
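 *
 * A hypothetical caller (for illustration only; the names and flags of
 * any real consumer will differ) follows the contract above roughly
 * like this:
 *
 *	vm_offset_t addr = vm_map_min(map);
 *
 *	vm_object_reference(obj);
 *	if (vm_map_find(map, obj, 0, &addr, size, 0, VMFS_ANY_SPACE,
 *	    VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)
 *		vm_object_deallocate(obj);
 *
 * i.e. the caller takes the object reference up front and drops it
 * again if the insertion fails.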
*/ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t alignment, initial_addr, start; int result; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; if (find_space >> 8 != 0) { KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); alignment = (vm_offset_t)1 << (find_space >> 8); } else alignment = 0; initial_addr = *addr; again: start = initial_addr; vm_map_lock(map); do { if (find_space != VMFS_NO_SPACE) { if (vm_map_findspace(map, start, length, addr) || (max_addr != 0 && *addr + length > max_addr)) { vm_map_unlock(map); if (find_space == VMFS_OPTIMAL_SPACE) { find_space = VMFS_ANY_SPACE; goto again; } return (KERN_NO_SPACE); } switch (find_space) { case VMFS_SUPER_SPACE: case VMFS_OPTIMAL_SPACE: pmap_align_superpage(object, offset, addr, length); break; case VMFS_ANY_SPACE: break; default: if ((*addr & (alignment - 1)) != 0) { *addr &= ~(alignment - 1); *addr += alignment; } break; } start = *addr; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); } } while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE && find_space != VMFS_ANY_SPACE); vm_map_unlock(map); return (result); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarentees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count) && (prev->cred == entry->cred)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (entry->prev != &map->header) vm_map_entry_resize_free(map, entry->prev); /* * If the backing object is a vnode object, * vm_object_deallocate() calls vrele(). * However, vrele() does not lock the vnode * because the vnode has additional * references. Thus, the map lock can be kept * without causing a lock-order reversal with * the vnode lock. * * Since we count the number of virtual page * mappings in object->un_pager.vnp.writemappings, * the writemappings value should not be adjusted * when the entry is disposed of. 
*/ if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); if (prev->cred != NULL) crfree(prev->cred); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count) && (next->cred == entry->cred)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); /* * See comment above. */ if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); if (next->cred != NULL) crfree(next->cred); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); /* * The object->un_pager.vnp.writemappings for the * object of MAP_ENTRY_VN_WRITECNT type entry shall be * kept as is here. The virtual pages are * re-distributed among the clipped entries, so the sum is * left the same. */ } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. 
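 *
 * For example, clipping an entry covering [start, end) at address A,
 * with start < A < end, leaves the original entry as [start, A) and
 * links a new entry for [A, end) immediately after it, with the new
 * entry's offset into the backing object advanced by (A - start).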
*/ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. 
(For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within * the specified address range are mapped. Because these mappings are * being created speculatively, cached pages are not reactivated and * mapped. */ void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_RLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { VM_OBJECT_RUNLOCK(object); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_LOCK_DOWNGRADE(object); } psize = atop(size); if (psize + pindex > object->size) { if (object->size < pindex) { VM_OBJECT_RUNLOCK(object); return; } psize = object->size - pindex; } start = 0; p_start = NULL; threshold = MAX_INIT_PT; p = vm_page_find_least(object, pindex); /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && cnt.v_free_count < cnt.v_free_reserved) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; break; } if (p->valid == VM_PAGE_BITS_ALL) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } /* Jump ahead if a superpage mapping is possible. */ if (p->psind > 0 && ((addr + ptoa(tmpidx)) & (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && vm_page_ps_is_valid(p)) { p += mask; threshold += mask; } } } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); VM_OBJECT_RUNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current, entry; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Do an accounting pass for private read-only mappings that * now will do cow due to allowed write (e.g. 
debugger sets * breakpoint on text segment) */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { vm_map_clip_end(map, current, end); if (set_max || ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || ENTRY_CHARGED(current)) { continue; } cred = curthread->td_ucred; obj = current->object.vm_object; if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { if (!swap_reserve(current->end - current->start)) { vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); current->cred = cred; continue; } VM_OBJECT_WLOCK(obj); if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { VM_OBJECT_WUNLOCK(obj); continue; } /* * Charge for the whole object allocation now, since * we cannot distinguish between non-charged and * charged clipped mapping of the same object later. */ KASSERT(obj->charge == 0, ("vm_map_protect: object %p overcharged (entry %p)", obj, current)); if (!swap_reserve(ptoa(obj->size))) { VM_OBJECT_WUNLOCK(obj); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); obj->cred = cred; obj->charge = ptoa(obj->size); VM_OBJECT_WUNLOCK(obj); } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ current = entry; while ((current != &map->header) && (current->start < end)) { old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * For user wired map entries, the normal lazy evaluation of * write access upgrades through soft page faults is * undesirable. Instead, immediately copy any pages that are * copy-on-write and enable write access in the physical map. */ if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && (current->protection & VM_PROT_WRITE) != 0 && (old_prot & VM_PROT_WRITE) == 0) vm_fault_copy_entry(map, map, current, current, NULL); /* * When restricting access, update the physical map. Worry * about copy-on-write here. */ if ((old_prot & ~current->protection) != 0) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: if (start == end) return (KERN_SUCCESS); modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: if (start == end) return (KERN_SUCCESS); vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary. 
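 *
 * Entry-modifying behaviors (modify_map != 0) clip the first entry so
 * the change is confined to [start, end); object-level behaviors leave
 * the entries intact and instead clamp the object pindex range below.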
*/ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pstart, pend; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useEnd, useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pstart = OFF_TO_IDX(current->offset); pend = pstart + atop(current->end - current->start); useStart = current->start; useEnd = current->end; if (current->start < start) { pstart += atop(start - current->start); useStart = start; } if (current->end > end) { pend -= atop(current->end - end); useEnd = end; } if (pstart >= pend) continue; /* * Perform the pmap_advise() before clearing * PGA_REFERENCED in vm_page_advise(). Otherwise, a * concurrent pmap operation, such as pmap_remove(), * could clear a reference in the pmap and set * PGA_REFERENCED on the page before the pmap_advise() * had completed. Consequently, the page would appear * referenced based upon an old reference that * occurred before this pmap_advise() ran. */ if (behav == MADV_DONTNEED || behav == MADV_FREE) pmap_advise(map->pmap, useStart, useEnd, behav); vm_object_madvise(current->object.vm_object, pstart, pend, behav); /* * Pre-populate paging structures in the * WILLNEED case. For wired entries, the * paging structures are already populated. */ if (behav == MADV_WILLNEED && current->wired_count == 0) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pstart, ptoa(pend - pstart), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vmspace_fork. 
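 *
 * Only VM_INHERIT_NONE, VM_INHERIT_COPY and VM_INHERIT_SHARE are
 * accepted; anything else fails with KERN_INVALID_ARGUMENT before the
 * map is locked.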
*/ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; if (start == end) return (KERN_SUCCESS); user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * If system unwiring, require that the entry is system wired. 
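 * A system unwire of an entry that has no system wirings fails with
 * KERN_INVALID_ARGUMENT, whereas a user unwire simply skips entries
 * that are not marked MAP_ENTRY_USER_WIRED in the cleanup pass after
 * "done:".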
*/ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } for (entry = first_entry; entry != &map->header && entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for draining * MAP_ENTRY_IN_TRANSITION. Moreover, another thread * could be simultaneously wiring this new mapping * entry. Detect these cases and skip any entries * marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_unwire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_unwire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_unwire: alien wire %p", entry)); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_wire_entry_failure: * * Handle a wiring failure on the given entry. * * The map should be locked. */ static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && entry->wired_count == 1, ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); KASSERT(failed_addr < entry->end, ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); /* * If any pages at the start of this entry were successfully wired, * then unwire them. */ if (failed_addr > entry->start) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); } /* * Assign an out-of-range value to represent the failure to wire this * entry. */ entry->wired_count = -1; } /* * vm_map_wire: * * Implements both kernel and user wiring. */ int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t faddr, saved_end, saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_wire; vm_prot_t prot; if (start == end) return (KERN_SUCCESS); prot = 0; if (flags & VM_MAP_WIRE_WRITE) prot |= VM_PROT_WRITE; user_wire = (flags & VM_MAP_WIRE_USER) ? 
TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || (entry->protection & prot) != prot) { entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } goto next_entry; } if (entry->wired_count == 0) { entry->wired_count++; saved_start = entry->start; saved_end = entry->end; /* * Release the map lock, relying on the in-transition * mark. Mark the map busy for fork. */ vm_map_busy(map); vm_map_unlock(map); faddr = saved_start; do { /* * Simulate a fault to get the page and enter * it into the physical map. */ if ((rv = vm_fault(map, faddr, VM_PROT_NONE, VM_FAULT_CHANGE_WIRING)) != KERN_SUCCESS) break; } while ((faddr += PAGE_SIZE) < saved_end); vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { /* * In case of failure, handle entries * that were not fully wired here; * fully wired entries are handled * later. */ if (rv != KERN_SUCCESS && faddr < entry->end) vm_map_wire_entry_failure(map, entry, faddr); entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { vm_map_wire_entry_failure(map, entry, faddr); end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 
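 * A "hole" here means that the current entry ends before the requested
 * range does and the next entry, if any, does not start exactly at
 * entry->end.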
*/ next_entry: if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } for (entry = first_entry; entry != &map->header && entry->start < end; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) goto next_entry_done; /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for faulting in the * pages or draining MAP_ENTRY_IN_TRANSITION. * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { /* * Undo the wiring. Wiring succeeded on this entry * but failed on a later entry. */ if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; } next_entry_done: KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_wire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_wire: alien wire %p", entry)); entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WIRE_SKIPPED); entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; boolean_t failed; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. 
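 * With "invalidate" set, a user-wired entry in the range fails the
 * request with KERN_INVALID_ARGUMENT, and a gap between consecutive
 * entries fails it with KERN_INVALID_ADDRESS, before any pages are
 * touched.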
*/ for (current = entry; current != &map->header && current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); failed = FALSE; /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current != &map->header && current->start < end;) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_reference(object); last_timestamp = map->timestamp; vm_map_unlock_read(map); if (!vm_object_sync(object, offset, size, syncio, invalidate)) failed = TRUE; start += size; vm_object_deallocate(object); vm_map_lock_read(map); if (last_timestamp == map->timestamp || !vm_map_lookup_entry(map, start, &current)) current = current->next; } vm_map_unlock_read(map); return (failed ? KERN_FAILURE : KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->wired_count > 0, ("vm_map_entry_unwire: entry %p isn't wired", entry)); pmap_unwire(map->pmap, entry->start, entry->end); vm_object_unwire(entry->object.vm_object, entry->offset, entry->end - entry->start, PQ_ACTIVE); entry->wired_count = 0; } static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) vm_object_deallocate(entry->object.vm_object); uma_zfree(system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count, size1; vm_ooffset_t size; vm_map_entry_unlink(map, entry); object = entry->object.vm_object; size = entry->end - entry->start; map->size -= size; if (entry->cred != NULL) { swap_release_by_cred(size, entry->cred); crfree(entry->cred); } if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object != NULL)) { KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); count = OFF_TO_IDX(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_WLOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object || object == kmem_object)) { vm_object_collapse(object); /* * The option OBJPR_NOTMAPPED can be passed here * because vm_map_delete() already performed * pmap_remove() on the only mapping to this range * of pages.
*/ vm_object_page_remove(object, offidxstart, offidxend, OBJPR_NOTMAPPED); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) { size1 = object->size; object->size = offidxstart; if (object->cred != NULL) { size1 -= object->size; KASSERT(object->charge >= ptoa(size1), ("vm_map_entry_delete: object->charge < 0")); swap_release_by_cred(ptoa(size1), object->cred); object->charge -= ptoa(size1); } } } VM_OBJECT_WUNLOCK(object); } else entry->object.vm_object = NULL; if (map->system_map) vm_map_entry_deallocate(entry, TRUE); else { entry->next = curthread->td_map_def_user; curthread->td_map_def_user = entry; } } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, 0); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } pmap_remove(map->pmap, entry->start, entry->end); /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its * page frames may be reallocated, and any modify bits * will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. 
*/ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { if (entry == &map->header) return (FALSE); /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; struct ucred *cred; int charged; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_WLOCK(src_object); charged = ENTRY_CHARGED(src_entry); if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } VM_OBJECT_WUNLOCK(src_object); dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) { /* * MAP_ENTRY_VN_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager.vnp.writecount * at the appropriate time. Attach * fake_entry to the deferred list. 
*/ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_VN_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) { struct vmspace *vm2; vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; int locked; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; vm_map_lock(old_map); if (old_map->busy) vm_map_wait_busy(old_map); new_map = &vm2->vm_map; locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. 
*/ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; if (old_entry->cred != NULL) { object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, old_entry->end - old_entry->start); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); /* * As in vm_map_simplify_entry(), the * vnode lock will not be acquired in * this call to vm_object_deallocate(). */ vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_WLOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); if (old_entry->cred != NULL) { KASSERT(object->cred == NULL, ("vmspace_fork both cred")); object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } /* * Assert the correct state of the vnode * v_writecount while the object is locked, to * not relock it later for the assertion * correctness. */ if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT && object->type == OBJT_VNODE) { KASSERT(((struct vnode *)object->handle)-> v_writecount > 0, ("vmspace_fork: v_writecount %p", object)); KASSERT(object->un_pager.vnp.writemappings > 0, ("vmspace_fork: vnp.writecount %p", object)); } VM_OBJECT_WUNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) { vnode_pager_update_writecount(object, new_entry->start, new_entry->end); } /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; /* * Copied entry is COW over the old object. */ new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->cred = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); break; } old_entry = old_entry->next; } /* * Use inlined vm_map_unlock() to postpone handling the deferred * map entries, which cannot be done until both old_map and * new_map locks are released. */ sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); return (vm2); } int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_size_t growsize, init_ssize; rlim_t lmemlim, vmemlim; int rv; growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? 
max_ssize : growsize; vm_map_lock(map); PROC_LOCK(curproc); lmemlim = lim_cur(curproc, RLIMIT_MEMLOCK); vmemlim = lim_cur(curproc, RLIMIT_VMEM); PROC_UNLOCK(curproc); if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) { rv = KERN_NO_SPACE; goto out; } } /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; goto out; } rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, max, cow); out: vm_map_unlock(map); return (rv); } static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, top; vm_size_t init_ssize; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. * NOTE: We explicitly allow bi-directional stacks. */ orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); if (addrbos < vm_map_min(map) || addrbos > vm_map_max(map) || addrbos + max_ssize < addrbos) return (KERN_NO_SPACE); init_ssize = (max_ssize < growsize) ? max_ssize : growsize; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) return (KERN_NO_SPACE); /* * If we can't accomodate max_ssize in the current mapping, no go. * However, we need to be aware that subsequent user mappings might * map into the space we have reserved for stack, and currently this * space is not protected. * * Hopefully we will at least detect this condition when we try to * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) return (KERN_NO_SPACE); /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) bot = addrbos + max_ssize - init_ssize; else if (orient == MAP_STACK_GROWS_UP) bot = addrbos; else bot = round_page(addrbos + max_ssize/2 - init_ssize/2); top = bot + init_ssize; rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); /* Now set the avail_ssize amount. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, bot); new_entry = prev_entry->next; if (new_entry->end != top || new_entry->start != bot) panic("Bad entry start/end for new stack entry"); new_entry->avail_ssize = max_ssize - init_ssize; if (orient & MAP_STACK_GROWS_DOWN) new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; if (orient & MAP_STACK_GROWS_UP) new_entry->eflags |= MAP_ENTRY_GROWS_UP; } return (rv); } static int stack_guard_page = 0; TUNABLE_INT("security.bsd.stack_guard_page", &stack_guard_page); SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RW, &stack_guard_page, 0, "Insert stack guard page ahead of the growable segments."); /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. 
Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack(struct proc *p, vm_offset_t addr) { vm_map_entry_t next_entry, prev_entry; vm_map_entry_t new_entry, stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; vm_size_t growsize; size_t grow_amount, max_grow; rlim_t lmemlim, stacklim, vmemlim; int is_procstack, rv; struct ucred *cred; #ifdef notyet uint64_t limit; #endif #ifdef RACCT int error; #endif Retry: PROC_LOCK(p); lmemlim = lim_cur(p, RLIMIT_MEMLOCK); stacklim = lim_cur(p, RLIMIT_STACK); vmemlim = lim_cur(p, RLIMIT_VMEM); PROC_UNLOCK(p); vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } next_entry = prev_entry->next; if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { /* * This entry does not grow upwards. Since the address lies * beyond this entry, the next entry (if one exists) has to * be a downward growable entry. The entry list header is * never a growable entry, so it suffices to check the flags. */ if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } stack_entry = next_entry; } else { /* * This entry grows upward. If the next entry does not at * least grow downwards, this is the entry we need to grow. * otherwise we have two possible choices and we have to * select one. */ if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { /* * We have two choices; grow the entry closest to * the address to minimize the amount of growth. */ if (addr - prev_entry->end <= next_entry->start - addr) stack_entry = prev_entry; else stack_entry = next_entry; } else stack_entry = prev_entry; } if (stack_entry == next_entry) { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); KASSERT(addr < stack_entry->start, ("foo")); end = (prev_entry != &map->header) ? prev_entry->end : stack_entry->start - stack_entry->avail_ssize; grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); max_grow = stack_entry->start - end; } else { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); KASSERT(addr >= stack_entry->end, ("foo")); end = (next_entry != &map->header) ? next_entry->start : stack_entry->end + stack_entry->avail_ssize; grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); max_grow = end - stack_entry->end; } if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* * If there is no longer enough space between the entries nogo, and * adjust the available space. Note: this should only happen if the * user has mapped into the stack area after the stack was created, * and is probably an error. * * This also effectively destroys any guard page the user might have * intended by limiting the stack size. */ if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = max_grow; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. 
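 *
 * For example (illustrative numbers only): with RLIMIT_STACK at 8MB and
 * ctob(vm->vm_ssize) already accounting for 8MB, any further growth
 * request fails the check below with KERN_NO_SPACE, which the faulting
 * thread typically sees as a SIGSEGV.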
*/ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } #ifdef RACCT - PROC_LOCK(p); - if (is_procstack && - racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (is_procstack && racct_set(p, RACCT_STACK, + ctob(vm->vm_ssize) + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + return (KERN_NO_SPACE); + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - return (KERN_NO_SPACE); } - PROC_UNLOCK(p); #endif /* Round up the grow amount modulo sgrowsiz */ growsize = sgrowsiz; grow_amount = roundup(grow_amount, growsize); if (grow_amount > stack_entry->avail_ssize) grow_amount = stack_entry->avail_ssize; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); PROC_UNLOCK(p); if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } #ifdef RACCT - PROC_LOCK(p); - if (racct_set(p, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (racct_set(p, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + rv = KERN_NO_SPACE; + goto out; + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - rv = KERN_NO_SPACE; - goto out; } - PROC_UNLOCK(p); #endif } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } #ifdef RACCT - PROC_LOCK(p); - if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { + if (racct_enable) { + PROC_LOCK(p); + if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { + PROC_UNLOCK(p); + vm_map_unlock_read(map); + rv = KERN_NO_SPACE; + goto out; + } PROC_UNLOCK(p); - vm_map_unlock_read(map); - rv = KERN_NO_SPACE; - goto out; } - PROC_UNLOCK(p); #endif if (vm_map_lock_upgrade(map)) goto Retry; if (stack_entry == next_entry) { /* * Growing downward. */ /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* * If this puts us into the previous entry, cut back our * growth to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = max_grow; addr = end; if (stack_guard_page) addr += PAGE_SIZE; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, next_entry->protection, next_entry->max_protection, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addr); new_entry = prev_entry->next; KASSERT(new_entry == stack_entry->prev, ("foo")); KASSERT(new_entry->end == stack_entry->start, ("foo")); KASSERT(new_entry->start == addr, ("foo")); grow_amount = new_entry->end - new_entry->start; new_entry->avail_ssize = stack_entry->avail_ssize - grow_amount; stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; } } else { /* * Growing upward. */ addr = stack_entry->end + grow_amount; /* * If this puts us into the next entry, cut back our growth * to the available space. Also, see the note above. 
*/ if (addr > end) { stack_entry->avail_ssize = end - stack_entry->end; addr = end; if (stack_guard_page) addr -= PAGE_SIZE; } grow_amount = addr - stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. */ else if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount, cred != NULL)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; stack_entry->avail_ssize -= grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; if (next_entry != &map->header) vm_map_clip_start(map, next_entry, addr); } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); vm_map_unlock(map); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { vm_map_wire(map, (stack_entry == next_entry) ? addr : addr - grow_amount, (stack_entry == next_entry) ? stack_entry->start : addr, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } out: #ifdef RACCT - if (rv != KERN_SUCCESS) { + if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); if (!old_mlock) { error = racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); } error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); KASSERT(error == 0, ("decreasing RACCT_STACK failed")); PROC_UNLOCK(p); } #endif return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); newvmspace = vmspace_alloc(minuser, maxuser, NULL); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); curthread->td_pflags |= TDP_EXECVMSPC; return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. 
*/ int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_ooffset_t fork_charge; if (oldvmspace->vm_refcnt == 1) return (0); fork_charge = 0; newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { vmspace_free(newvmspace); return (ENOMEM); } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); vmspace_free(oldvmspace); return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; vm_object_t eobject; vm_size_t size; struct ucred *cred; RetryLookup:; vm_map_lock_read(map); /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } entry = *out_entry; /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE)) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if ((fault_type & VM_PROT_WRITE) != 0 || (fault_typea & VM_PROT_COPY) != 0) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; if (entry->cred == NULL) { /* * The debugger owner is charged for * the memory. 
*/ cred = curthread->td_ucred; crhold(cred); if (!swap_reserve_by_cred(size, cred)) { crfree(cred); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } entry->cred = cred; } vm_object_shadow(&entry->object.vm_object, &entry->offset, size); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; eobject = entry->object.vm_object; if (eobject->cred != NULL) { /* * The object was not shadowed. */ swap_release_by_cred(size, entry->cred); crfree(entry->cred); entry->cred = NULL; } else if (entry->cred != NULL) { VM_OBJECT_WLOCK(eobject); eobject->cred = entry->cred; eobject->charge = size; VM_OBJECT_WUNLOCK(eobject); entry->cred = NULL; } vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(size)); entry->offset = 0; if (entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = size; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE)) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. 
*/ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = (struct proc *) addr; } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ Index: stable/10/sys/vm/vm_mmap.c =================================================================== --- stable/10/sys/vm/vm_mmap.c (revision 284664) +++ stable/10/sys/vm/vm_mmap.c (revision 284665) @@ -1,1696 +1,1700 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); TUNABLE_INT("vm.old_mlock", &old_mlock); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct cdev *, vm_ooffset_t *, vm_object_t *); static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct shmfd *, vm_ooffset_t, vm_object_t *); #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. 
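 *
 * For example (illustrative values only, assuming PAGE_SIZE is 4096):
 * a MAP_FIXED request for addr 0x10234 with file offset 0x30234 is
 * accepted because both are 0x234 modulo the page size; the mapping
 * itself covers the page-aligned range beginning at
 * trunc_page(0x10234) == 0x10000 and mmap() returns 0x10234.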
* * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int sys_mmap(td, uap) struct thread *td; struct mmap_args *uap; { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t cap_maxprot, prot, maxprot; void *handle; objtype_t handle_type; int align, error, flags; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; cap_rights_t rights; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; fp = NULL; /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. 
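 *
 * For example (illustrative numbers only): with vm_daddr at 0x601000
 * and a RLIMIT_DATA hard limit of 512MB, a hint of 0, or any hint
 * falling between the text segment and that point, is replaced with
 * round_page(0x601000 + 512MB), placing the mapping above the largest
 * address the heap could ever reach.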
*/ PROC_LOCK(td->td_proc); if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); } if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; handle_type = OBJT_DEFAULT; maxprot = VM_PROT_ALL; cap_maxprot = VM_PROT_ALL; } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. */ cap_rights_init(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set(&rights, CAP_MMAP_X); error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if (fp->f_type == DTYPE_SHM) { handle = fp->f_data; handle_type = OBJT_SWAP; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if (fp->f_flag & FREAD) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if (fp->f_flag & FWRITE) maxprot |= VM_PROT_WRITE; goto map; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto done; } #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) maxprot = VM_PROT_NONE; else maxprot = VM_PROT_EXECUTE; if (fp->f_flag & FREAD) { maxprot |= VM_PROT_READ; } else if (prot & PROT_READ) { error = EACCES; goto done; } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. 
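 *
 * For example, a descriptor opened O_RDONLY and then passed to mmap()
 * with PROT_WRITE | MAP_SHARED fails here with EACCES, while the same
 * request with MAP_PRIVATE succeeds because writes only touch the
 * caller's copy-on-write copy.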
*/ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; } else if ((prot & PROT_WRITE) != 0) { error = EACCES; goto done; } } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } handle = (void *)vp; handle_type = OBJT_VNODE; } map: td->td_fpop = fp; maxprot &= cap_maxprot; error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle_type, handle, pos); td->td_fpop = NULL; #ifdef HWPMC_HOOKS /* inform hwpmc(4) if an executable is being mapped */ if (error == 0 && handle_type == OBJT_VNODE && (prot & PROT_EXEC)) { pkm.pm_file = handle; pkm.pm_address = (uintptr_t) addr; PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); } #endif if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { struct mmap_args oargs; oargs.addr = uap->addr; oargs.len = uap->len; oargs.prot = uap->prot; oargs.flags = uap->flags; oargs.fd = uap->fd; oargs.pos = uap->pos; return (sys_mmap(td, &oargs)); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; #ifdef COMPAT_FREEBSD32 #if defined(__amd64__) || defined(__ia64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && nargs.prot != 0) nargs.prot |= PROT_EXEC; #endif #endif nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (sys_mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif /* * MPSAFE */ int sys_msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. 
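 *
 * For example, a synchronous writeback of a mapped file region from
 * userland (illustrative fragment only):
 *
 *	if (msync(addr, len, MS_SYNC) == -1)
 *		err(1, "msync");
 *
 * reaches this point with syncio == TRUE and invalidate == FALSE;
 * MS_INVALIDATE additionally frees any cached pages and fails with
 * EBUSY if part of the range is wired by mlock(2).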
*/ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_munmap(td, uap) struct thread *td; struct munmap_args *uap; { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; #endif vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; if (size == 0) return (EINVAL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS /* * Inform hwpmc if the address range being unmapped contains * an executable region. */ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry != &map->header && entry->start < addr + size; entry = entry->next) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } #endif vm_map_delete(map, addr, addr + size); #ifdef HWPMC_HOOKS /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); #else vm_map_unlock(map); #endif /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int sys_mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int sys_minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ int sys_madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; vm_map_t map; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." 
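 *
 * For example, a critical daemon running with sufficient privilege can
 * exempt itself from being killed when swap is exhausted (illustrative
 * fragment only):
 *
 *	if (madvise(NULL, 0, MADV_PROTECT) != 0)
 *		warn("madvise(MADV_PROTECT)");
 *
 * The address and length are ignored for this request; it is turned
 * into a PROC_SPROTECT procctl on the calling process below.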
*/ if (uap->behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; if ((vm_offset_t)uap->addr < vm_map_min(map) || (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) return (EINVAL); if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); if (vm_map_madvise(map, start, end, uap->behav)) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ int sys_mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; vm_object_t object; vm_paddr_t locked_pa; vm_page_t m; vm_pindex_t pindex; int mincoreinfo; unsigned int timestamp; boolean_t locked; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); /* * Address of byte vector */ vec = uap->vec; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * check for contiguity */ if (current->end < end && (entry->next == &map->header || current->next->start > current->end)) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ object = NULL; locked_pa = 0; retry: m = NULL; mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); if (locked_pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. 
*/ m = PHYS_TO_VM_PAGE(locked_pa); if (m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = m->object; locked = VM_OBJECT_TRYWLOCK(object); vm_page_unlock(m); if (!locked) { VM_OBJECT_WLOCK(object); vm_page_lock(m); goto retry; } } else vm_page_unlock(m); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } if (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); if (m == NULL && vm_page_is_cached(object, pindex)) mincoreinfo = MINCORE_INCORE; if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { /* Examine other mappings to the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ if ((m->aflags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || (m->aflags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
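 *
 * For context, the user-visible contract being filled in here, as a
 * minimal userland sketch (illustrative only): mincore(2) stores one
 * status byte per page of the queried range into the caller's vector:
 */
#if 0	/* Illustrative sketch only. */
#include <sys/mman.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	size_t pgsz;
	char vec[4], *p;
	int i;

	pgsz = (size_t)getpagesize();
	p = mmap(NULL, 4 * pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	p[0] = 1;	/* Touch only the first page. */
	if (mincore(p, 4 * pgsz, vec) == -1)
		err(1, "mincore");
	for (i = 0; i < 4; i++)
		printf("page %d %s resident\n", i,
		    (vec[i] & MINCORE_INCORE) ? "is" : "is not");
	return (0);
}
#endif
/*
 * The check for a map changed during subyte() follows.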
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_mlock(td, uap) struct thread *td; struct mlock_args *uap; { return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); } int vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); if (error) return (error); addr = (vm_offset_t)addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef RACCT - PROC_LOCK(proc); - error = racct_set(proc, RACCT_MEMLOCK, nsize); - PROC_UNLOCK(proc); - if (error != 0) - return (ENOMEM); + if (racct_enable) { + PROC_LOCK(proc); + error = racct_set(proc, RACCT_MEMLOCK, nsize); + PROC_UNLOCK(proc); + if (error != 0) + return (ENOMEM); + } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT - if (error != KERN_SUCCESS) { + if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int sys_mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { PROC_LOCK(td->td_proc); if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); - PROC_UNLOCK(td->td_proc); - if (error != 0) - return (ENOMEM); + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); + PROC_UNLOCK(td->td_proc); + if (error != 0) + return (ENOMEM); + } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 
0 : EAGAIN); } #ifdef RACCT - if (error != KERN_SUCCESS) { + if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif /* * MPSAFE */ int sys_munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT - if (error == KERN_SUCCESS) { + if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int sys_munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr, end, last, start; vm_size_t size; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT - if (error == KERN_SUCCESS) { + if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. * * For VCHR vnodes, the vnode lock is held over the call to * vm_mmap_cdev() to keep vp->v_rdev valid. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_offset_t foff; struct ucred *cred; int error, flags, locktype; cred = td->td_ucred; if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) locktype = LK_EXCLUSIVE; else locktype = LK_SHARED; if ((error = vget(vp, locktype, td)) != 0) return (error); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. 
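 *
 * For orientation, the common case reaching this helper is a plain
 * file-backed mapping; a writable MAP_SHARED request takes the
 * exclusive vnode lock chosen above.  A minimal userland sketch
 * (illustrative only; the file name is hypothetical):
 */
#if 0	/* Illustrative sketch; the path below is hypothetical. */
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	int fd;

	fd = open("/tmp/example.dat", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, 4096) == -1)
		err(1, "ftruncate");
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memcpy(p, "hello", 5);	/* Dirty the first page of the file. */
	if (msync(p, 4096, MS_SYNC) == -1)
		err(1, "msync");
	return (0);
}
#endif
/*
 * The helper continues below by re-locking the bypassed vnode.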
*/ error = vget(vp, locktype, td); if (error != 0) return (error); } if (locktype == LK_EXCLUSIVE) { *writecounted = TRUE; vnode_pager_update_writecount(obj, 0, objsize); } } else if (vp->v_type == VCHR) { error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, vp->v_rdev, foffp, objp); if (error == 0) goto mark_atime; goto done; } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC error = mac_vnode_check_mmap(cred, vp, prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); vm_object_reference(obj); } if (obj == NULL) { error = ENOMEM; goto done; } *objp = obj; *flagsp = flags; mark_atime: vfs_mark_atime(vp, cred); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vnode_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * MPSAFE * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; struct cdevsw *dsw; int error, flags, ref; flags = *flagsp; dsw = dev_refthread(cdev, &ref); if (dsw == NULL) return (ENXIO); if (dsw->d_flags & D_MMAP_ANON) { dev_relthread(cdev, ref); *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & PROT_WRITE) != 0) { dev_relthread(cdev, ref); return (EACCES); } if (flags & (MAP_PRIVATE|MAP_COPY)) { dev_relthread(cdev, ref); return (EINVAL); } /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); if (error != 0) { dev_relthread(cdev, ref); return (error); } #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). * * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); dev_relthread(cdev, ref); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* * vm_mmap_shm() * * MPSAFE * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on shm file descriptors. 
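 *
 * A minimal userland sketch of a mapping that is validated here
 * (illustrative only): an anonymous POSIX shared memory object mapped
 * MAP_SHARED, which is handled via the OBJT_SWAP case of vm_mmap()
 * below:
 */
#if 0	/* Illustrative sketch only. */
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	int fd;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	if (fd == -1)
		err(1, "shm_open");
	if (ftruncate(fd, 4096) == -1)
		err(1, "ftruncate");
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	p[0] = 'x';
	return (0);
}
#endif
/*
 * vm_mmap_shm() itself follows.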
*/ int vm_mmap_shm(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) { int error; if ((*flagsp & MAP_SHARED) != 0 && (*maxprotp & VM_PROT_WRITE) == 0 && (prot & PROT_WRITE) != 0) return (EACCES); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); if (error != 0) return (error); #endif error = shm_mmap(shmfd, objsize, foff, objp); if (error) return (error); return (0); } /* * vm_mmap() * * MPSAFE * * Internal version of mmap. Currently used by mmap, exec, and sys5 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object = NULL; struct thread *td = curthread; int docow, error, findspace, rv; boolean_t writecounted; if (size == 0) return (0); size = round_page(size); if (map == &td->td_proc->p_vmspace->vm_map) { PROC_LOCK(td->td_proc); if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + size); if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (error); } } PROC_UNLOCK(td->td_proc); } /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; } writecounted = FALSE; /* * Lookup/allocate object. */ switch (handle_type) { case OBJT_DEVICE: error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, handle, &foff, &object); break; case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, &foff, &object, &writecounted); break; case OBJT_SWAP: error = vm_mmap_shm(td, size, prot, &maxprot, &flags, handle, foff, &object); break; case OBJT_DEFAULT: if (handle == NULL) { error = 0; break; } /* FALLTHROUGH */ default: error = EINVAL; break; } if (error) return (error); if (flags & MAP_ANON) { object = NULL; docow = 0; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else docow = MAP_PREFAULT_PARTIAL; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; /* Shared memory is also shared with children. 
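 *
 * A minimal userland sketch of what that means for callers
 * (illustrative only): a MAP_SHARED mapping established before fork(2)
 * remains shared with the child, so stores are visible across the
 * fork:
 */
#if 0	/* Illustrative sketch only. */
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

#include <assert.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	int *counter;

	counter = mmap(NULL, sizeof(*counter), PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_SHARED, -1, 0);
	if (counter == MAP_FAILED)
		err(1, "mmap");
	*counter = 0;
	if (fork() == 0) {
		*counter = 42;	/* Child writes through the shared mapping. */
		_exit(0);
	}
	if (wait(NULL) == -1)
		err(1, "wait");
	assert(*counter == 42);	/* Parent observes the child's store. */
	return (0);
}
#endif
/*
 * The MAP_INHERIT_SHARE flag is applied just below.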
*/ if (flags & MAP_SHARED) docow |= MAP_INHERIT_SHARE; if (writecounted) docow |= MAP_VN_WRITECOUNT; if (flags & MAP_STACK) { if (object != NULL) return (EINVAL); docow |= MAP_STACK_GROWS_DOWN; } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) findspace = VMFS_SUPER_SPACE; else if ((flags & MAP_ALIGNMENT_MASK) != 0) findspace = VMFS_ALIGNED_SPACE(flags >> MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; rv = vm_map_find(map, object, foff, addr, size, #ifdef MAP_32BIT flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : #endif 0, findspace, prot, maxprot, docow); } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); } if (rv == KERN_SUCCESS) { /* * If the process has requested that all future mappings * be wired, then heed this. */ if (map->flags & MAP_WIREFUTURE) { vm_map_wire(map, *addr, *addr + size, VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); } } else { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vnode_pager_release_writecount(object, 0, size); /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ vm_object_deallocate(object); } return (vm_mmap_to_errno(rv)); } /* * Translate a Mach VM return code to zero on success or the appropriate errno * on failure. */ int vm_mmap_to_errno(int rv) { switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: stable/10/sys/vm/vm_pageout.c =================================================================== --- stable/10/sys/vm/vm_pageout.c (revision 284664) +++ stable/10/sys/vm/vm_pageout.c (revision 284665) @@ -1,1912 +1,1921 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ int vm_pageout_wakeup_thresh; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. 
*/ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif static int vm_max_launder = 32; static int vm_pageout_update_period; static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static int lowmem_ticks; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; static int vm_swap_idle_enabled = 0; #else static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; #endif SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, "Low memory callback period"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t, vm_paddr_t); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it wirte busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queues * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. 
* * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = (m->queue == queue && m->object == object && &marker == TAILQ_NEXT(m, plinks.q)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, plinks.q)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. */ static int vm_pageout_clean(vm_page_t m) { vm_object_t object; vm_page_t mc[2*vm_pageout_page_count], pb, ps; int pageout_count; int ib, is, page_base; vm_pindex_t pindex = m->pindex; vm_page_lock_assert(m, MA_OWNED); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ /* * Can't clean the page if it's busy or held. */ vm_page_assert_unbusied(m); KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m)); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying * align the clusters (which leave sporatic out-of-order * holes). 
To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib && pageout_count < vm_pageout_page_count) { vm_page_t p; if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * alignment boundry, stop here and switch directions. Do * not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { vm_page_t p; if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_lock(p); vm_page_test_dirty(p); if (p->dirty == 0 || p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past a page boundry. This catches boundry * conditions. */ if (ib && pageout_count < vm_pageout_page_count) goto more; /* * we allow reads during pageouts... */ return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). 
*/ vm_page_lock(mt); vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); if (vm_page_count_severe()) { vm_page_lock(mt); vm_page_try_to_cache(mt); vm_page_unlock(mt); } } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low, vm_paddr_t high) { struct mount *mp; struct vnode *vp; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_tmp, next; int lockmode; vm_pagequeue_lock(pq); TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) { if ((m->flags & PG_MARKER) != 0) continue; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + PAGE_SIZE > high) continue; if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { vm_page_unlock(m); continue; } object = m->object; if ((!VM_OBJECT_TRYWLOCK(object) && (!vm_pageout_fallback_object_lock(m, &next) || m->hold_count != 0)) || vm_page_busied(m)) { vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); continue; } vm_page_test_dirty(m); if (m->dirty == 0 && object->ref_count != 0) pmap_remove_all(m); if (m->dirty != 0) { vm_page_unlock(m); if (tries == 0 || (object->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(object); continue; } if (object->type == OBJT_VNODE) { vm_pagequeue_unlock(pq); vp = object->handle; vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); (void)vn_start_write(vp, &mp, V_WAIT); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; vn_lock(vp, lockmode | LK_RETRY); VM_OBJECT_WLOCK(object); vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vm_object_deallocate(object); vn_finished_write(mp); return (TRUE); } else if (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT) { vm_pagequeue_unlock(pq); m_tmp = m; vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC, 0, NULL, NULL); VM_OBJECT_WUNLOCK(object); return (TRUE); } } else { /* * Dequeue here to prevent lock recursion in * vm_page_cache(). */ vm_page_dequeue_locked(m); vm_page_cache(m); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(object); } vm_pagequeue_unlock(pq); return (FALSE); } /* * Increase the number of cached pages. The specified value, "tries", * determines which categories of pages are cached: * * 0: All clean, inactive pages within the specified physical address range * are cached. Will not sleep. * 1: The vm_lowmem handlers are called. All inactive pages within * the specified physical address range are cached. May sleep. * 2: The vm_lowmem handlers are called. All inactive and active pages * within the specified physical address range are cached. May sleep. */ void vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) { int actl, actmax, inactl, inactmax, dom, initial_dom; static int start_dom = 0; if (tries > 0) { /* * Decrease registered cache sizes. The vm_lowmem handlers * may acquire locks and/or sleep, so they can only be invoked * when "tries" is greater than zero. */ SDT_PROBE0(vm, , , vm__lowmem_cache); EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been drained * above. */ uma_reclaim(); } /* * Make the next scan start on the next domain. 
*/ initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; inactl = 0; inactmax = cnt.v_inactive_count; actl = 0; actmax = tries < 2 ? 0 : cnt.v_active_count; dom = initial_dom; /* * Scan domains in round-robin order, first inactive queues, * then active. Since domain usually owns large physically * contiguous chunk of memory, it makes sense to completely * exhaust one domain before switching to next, while growing * the pool of contiguous physical pages. * * Do not even start launder a domain which cannot contain * the specified address range, as indicated by segments * constituting the domain. */ again: if (inactl < inactmax) { if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, low, high) && vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE], tries, low, high)) { inactl++; goto again; } if (++dom == vm_ndomains) dom = 0; if (dom != initial_dom) goto again; } if (actl < actmax) { if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, low, high) && vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE], tries, low, high)) { actl++; goto again; } if (++dom == vm_ndomains) dom = 0; if (dom != initial_dom) goto again; } } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (vm_page_busied(p)) continue; PCPU_INC(cnt.v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } if (p->queue != PQ_ACTIVE && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; } else if (p->queue == PQ_ACTIVE) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } } else if (p->queue == PQ_INACTIVE) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. 
*/ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } #ifdef __ia64__ /* * Remove all non-wired, managed mappings if a process is swapped out. * This will free page table pages. */ if (desired == 0) pmap_remove_pages(map->pmap); #else /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } #endif vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass 0 - Update active LRU/deactivate pages * pass 1 - Move inactive to cache or free * pass 2 - Launder dirty pages */ static void vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; int act_delta, addl_page_shortage, deficit, maxscan, page_shortage; int vnodes_skipped = 0; int maxlaunder; int lockmode; boolean_t queues_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (ticks - lowmem_ticks) / hz >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_ticks = ticks; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages we want to either free or move * to the cache. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; /* * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. 
However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. */ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass > 1) maxlaunder = 10000; /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queues_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queues_locked, ("unlocked queues")); KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object) && !vm_pageout_fallback_object_lock(m, &next)) { vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); continue; } /* * Don't mess with busy pages, keep them at at the * front of the queue, most likely they are being * paged out. Increment addl_page_shortage for busy * pages, because they may leave the inactive queue * shortly after page scan is finished. */ if (vm_page_busied(m)) { vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); addl_page_shortage++; continue; } /* * We unlock the inactive page queue, invalidating the * 'next' pointer. Use our marker to remember our * place. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_pagequeue_unlock(pq); queues_locked = FALSE; /* * We bump the activation count if the page has been * referenced while in the inactive queue. This makes * it less likely that the page will be added back to the * inactive queue prematurely again. Here we check the * page tables (or emulated bits, if any), given the upper * level VM system not knowing anything about existing * references. */ act_delta = 0; if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } /* * If the upper level VM system knows about any page * references, we reactivate the page or requeue it. */ if (act_delta != 0) { if (object->ref_count) { vm_page_activate(m); m->act_count += act_delta + ACT_ADVANCE; } else { vm_pagequeue_lock(pq); queues_locked = TRUE; vm_page_requeue_locked(m); } VM_OBJECT_WUNLOCK(object); vm_page_unlock(m); goto relock_queues; } if (m->hold_count != 0) { vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of the page_shortage for the * loop over the active queue below. 
*/ addl_page_shortage++; goto relock_queues; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of placing it onto the cache queue. If, * however, any of the page's mappings allow write access, * then the page may still be modified until the last of those * mappings are removed. */ vm_page_test_dirty(m); if (m->dirty == 0 && object->ref_count != 0) pmap_remove_all(m); if (m->valid == 0) { /* * Invalid pages can be easily freed */ vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; } else if (m->dirty == 0) { /* * Clean pages can be placed onto the cache queue. * This effectively frees them. */ vm_page_cache(m); --page_shortage; } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive verses freeing * a clean page. Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ m->flags |= PG_WINATCFLS; vm_pagequeue_lock(pq); queues_locked = TRUE; vm_page_requeue_locked(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. */ int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp = NULL; if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { swap_pageouts_ok = 1; } else { swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && vm_page_count_min()); } /* * We don't bother paging objects that are "dead". * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { vm_pagequeue_lock(pq); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); queues_locked = TRUE; vm_page_requeue_locked(m); goto relock_queues; } /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * The previous code skipped locked vnodes and, worse, * reordered pages in the queue. This results in * completely non-deterministic operation and, on a * busy system, can lead to extremely non-optimal * pageouts. For example, it can cause clean pages * to be freed and dirty pages to be moved to the end * of the queue. Since dirty pages are also moved to * the end of the queue once-cleaned, this gives * way too large a weighting to defering the freeing * of dirty pages. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. 
*/ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; ++pageout_lock_miss; if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { VM_OBJECT_WLOCK(object); ++pageout_lock_miss; if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vp = NULL; goto unlock_and_continue; } VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); queues_locked = TRUE; /* * The page might have been moved to another * queue during potential blocking in vget() * above. The page might have been freed and * reused for another vnode. */ if (m->queue != PQ_INACTIVE || m->object != object || TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) { vm_page_unlock(m); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } /* * The page may have been busied during the * blocking in vget(). We don't move the * page back onto the end of the queue so that * statistics are more correct if we don't. */ if (vm_page_busied(m)) { vm_page_unlock(m); addl_page_shortage++; goto unlock_and_continue; } /* * If the page has become held it might * be undergoing I/O, so skip it */ if (m->hold_count != 0) { vm_page_unlock(m); addl_page_shortage++; if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } vm_pagequeue_unlock(pq); queues_locked = FALSE; } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. * * decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ if (vm_pageout_clean(m) != 0) { --page_shortage; --maxlaunder; } unlock_and_continue: vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_WUNLOCK(object); if (mp != NULL) { if (queues_locked) { vm_pagequeue_unlock(pq); queues_locked = FALSE; } if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } vm_page_lock_assert(m, MA_NOTOWNED); goto relock_queues; } vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queues: if (!queues_locked) { vm_pagequeue_lock(pq); queues_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't cache or free the targeted * number of pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't cache or free enough pages. */ if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target - cnt.v_free_min) (void)speedup_syncer(); /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ page_shortage = cnt.v_inactive_target - cnt.v_inactive_count + vm_paging_target() + deficit + addl_page_shortage; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. 
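 *
 * The period is the vm.pageout_update_period sysctl declared earlier
 * in this file.  A minimal userland sketch for reading it
 * (illustrative only):
 */
#if 0	/* Illustrative sketch only. */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int period;
	size_t len;

	len = sizeof(period);
	if (sysctlbyname("vm.pageout_update_period", &period, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname");
	printf("active queue scan period: %d seconds\n", period);
	return (0);
}
#endif
/*
 * The idle-polling scaling of the scan below uses that period.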
*/ if (pass == 0 && vm_pageout_update_period != 0) { maxscan /= vm_pageout_update_period; page_shortage = maxscan; } /* * Scan the active queue for things we can deactivate. We nominally * track the per-page activity counter and use it to locate * deactivation candidates. */ m = TAILQ_FIRST(&pq->pq_pl); while (m != NULL && maxscan-- > 0 && page_shortage > 0) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) { m = next; continue; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligibility... */ PCPU_INC(cnt.v_pdpages); /* * Check to see "how much" the page has been used. */ act_delta = 0; if (m->aflags & PGA_REFERENCED) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta += 1; } /* * Unlocked object ref count check. Two races are possible. * 1) The ref was transitioning to zero and we saw non-zero, * the pmap bits will be checked unnecessarily. * 2) The ref was transitioning to one and we saw zero. * The page lock prevents a new reference to this page so * we need not check the reference bits. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else { m->act_count -= min(m->act_count, ACT_DECLINE); act_delta = m->act_count; } /* * Move this page to the tail of the active or inactive * queue depending on usage. */ if (act_delta == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); vm_page_deactivate(m); page_shortage--; } else vm_page_requeue_locked(m); vm_page_unlock(m); m = next; } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } #endif /* * If we are critically low on one of RAM or swap and low on * the other, kill the largest process. However, we avoid * doing this on the first pass in order to give ourselves a * chance to flush out dirty vnode-backed pages and to allow * active pages to be moved to the inactive queue and reclaimed. */ vm_pageout_mightbe_oom(vmd, pass); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) { int old_vote; if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) || (swap_pager_full && vm_paging_target() > 0))) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. 
On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of it's child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD(p); if (!vm_map_trylock_read(&vm->vm_map)) { _PRELE(p); PROC_UNLOCK(p); vmspace_free(vm); continue; } PROC_UNLOCK(p); size = vmspace_swap_count(vm); vm_map_unlock_read(&vm->vm_map); if (shortage == VM_OOM_MEM) size += vmspace_resident_count(vm); vmspace_free(vm); /* * if the this process is bigger than the biggest one * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); wakeup(&cnt.v_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *domain; int domidx; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(domain->vmd_segs != 0, ("domain without segments")); vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { /* * If we have enough free memory, wakeup waiters. Do * not clear vm_pages_needed until we reach our target, * otherwise we may be woken up over and over again and * waste a lot of cpu. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_pages_needed && !vm_page_count_min()) { if (!vm_paging_needed()) vm_pages_needed = 0; wakeup(&cnt.v_free_count); } if (vm_pages_needed) { /* * Still not done, take a second pass without waiting * (unlimited dirty cleaning), otherwise sleep a bit * and try again. */ if (domain->vmd_pass > 1) msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", hz / 2); } else { /* * Good enough, sleep until required to refresh * stats. 
*/ domain->vmd_pass = 0; msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", hz); } if (vm_pages_needed) { cnt.v_pdwakeups++; domain->vmd_pass++; } mtx_unlock(&vm_page_queue_free_mtx); vm_pageout_scan(domain, domain->vmd_pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init(void) { /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (cnt.v_page_count / 768); cnt.v_free_severe = cnt.v_free_min / 2; cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; cnt.v_free_min += cnt.v_free_reserved; cnt.v_free_severe += cnt.v_free_reserved; cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; if (cnt.v_inactive_target > cnt.v_free_count / 3) cnt.v_inactive_target = cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; #if MAXMEMDOM > 1 int i; #endif swap_pager_swap_init(); #if MAXMEMDOM > 1 for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } } #endif error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup(void) { if (!vm_pages_needed && curthread->td_proc != pageproc) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); + msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT - msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz); + racct_enable ? 
hz : 0 #else - msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0); + 0 #endif + ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); PROC_UNLOCK(p); if (vm == NULL) continue; size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); } #ifdef RACCT - rsize = IDX_TO_OFF(size); - PROC_LOCK(p); - racct_set(p, RACCT_RSS, rsize); - ravailable = racct_get_available(p, RACCT_RSS); - PROC_UNLOCK(p); - if (rsize > ravailable) { - /* - * Don't be overly aggressive; this might be - * an innocent process, and the limit could've - * been exceeded by some memory hog. Don't - * try to deactivate more than 1/4th of process' - * resident set size. - */ - if (attempts <= 8) { - if (ravailable < rsize - (rsize / 4)) - ravailable = rsize - (rsize / 4); - } - vm_pageout_map_deactivate_pages( - &vm->vm_map, OFF_TO_IDX(ravailable)); - /* Update RSS usage after paging out. */ - size = vmspace_resident_count(vm); + if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); + ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); - if (rsize > ravailable) - tryagain = 1; + if (rsize > ravailable) { + /* + * Don't be overly aggressive; this + * might be an innocent process, + * and the limit could've been exceeded + * by some memory hog. Don't try + * to deactivate more than 1/4th + * of process' resident set size. + */ + if (attempts <= 8) { + if (ravailable < rsize - + (rsize / 4)) { + ravailable = rsize - + (rsize / 4); + } + } + vm_pageout_map_deactivate_pages( + &vm->vm_map, + OFF_TO_IDX(ravailable)); + /* Update RSS usage after paging out. */ + size = vmspace_resident_count(vm); + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + racct_set(p, RACCT_RSS, rsize); + PROC_UNLOCK(p); + if (rsize > ravailable) + tryagain = 1; + } } #endif vmspace_free(vm); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: stable/10/sys/vm/vm_unix.c =================================================================== --- stable/10/sys/vm/vm_unix.c (revision 284664) +++ stable/10/sys/vm/vm_unix.c (revision 284665) @@ -1,246 +1,257 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ * * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 */ #include "opt_compat.h" /* * Traditional sbrk/grow interface to VM */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct obreak_args { char *nsize; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_obreak(td, uap) struct thread *td; struct obreak_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t new, old, base; rlim_t datalim, lmemlim, vmemlim; int prot, rv; int error = 0; boolean_t do_map_wirefuture; PROC_LOCK(td->td_proc); datalim = lim_cur(td->td_proc, RLIMIT_DATA); lmemlim = lim_cur(td->td_proc, RLIMIT_MEMLOCK); vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM); PROC_UNLOCK(td->td_proc); do_map_wirefuture = FALSE; new = round_page((vm_offset_t)uap->nsize); vm_map_lock(map); base = round_page((vm_offset_t) vm->vm_daddr); old = base + ctob(vm->vm_dsize); if (new > base) { /* * Check the resource limit, but allow a process to reduce * its usage, even if it remains over the limit. */ if (new - base > datalim && new > old) { error = ENOMEM; goto done; } if (new > vm_map_max(map)) { error = ENOMEM; goto done; } } else if (new < base) { /* * This is simply an invalid value. If someone wants to * do fancy address space manipulations, mmap and munmap * can do most of what the user would want. 
*/ error = EINVAL; goto done; } if (new > old) { if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + (new - old) > lmemlim) { error = ENOMEM; goto done; } } if (map->size + (new - old) > vmemlim) { error = ENOMEM; goto done; } #ifdef RACCT - PROC_LOCK(td->td_proc); - error = racct_set(td->td_proc, RACCT_DATA, new - base); - if (error != 0) { - PROC_UNLOCK(td->td_proc); - error = ENOMEM; - goto done; - } - error = racct_set(td->td_proc, RACCT_VMEM, - map->size + (new - old)); - if (error != 0) { - racct_set_force(td->td_proc, RACCT_DATA, old - base); - PROC_UNLOCK(td->td_proc); - error = ENOMEM; - goto done; - } - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - error = racct_set(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap)) + (new - old)); + if (racct_enable) { + PROC_LOCK(td->td_proc); + error = racct_set(td->td_proc, RACCT_DATA, new - base); if (error != 0) { + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + error = racct_set(td->td_proc, RACCT_VMEM, + map->size + (new - old)); + if (error != 0) { racct_set_force(td->td_proc, RACCT_DATA, old - base); - racct_set_force(td->td_proc, RACCT_VMEM, - map->size); PROC_UNLOCK(td->td_proc); error = ENOMEM; goto done; } + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + error = racct_set(td->td_proc, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap)) + + (new - old)); + if (error != 0) { + racct_set_force(td->td_proc, RACCT_DATA, + old - base); + racct_set_force(td->td_proc, RACCT_VMEM, + map->size); + PROC_UNLOCK(td->td_proc); + error = ENOMEM; + goto done; + } + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif prot = VM_PROT_RW; #ifdef COMPAT_FREEBSD32 #if defined(__amd64__) || defined(__ia64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32)) prot |= VM_PROT_EXECUTE; #endif #endif rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set_force(td->td_proc, RACCT_DATA, old - base); - racct_set_force(td->td_proc, RACCT_VMEM, map->size); - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - racct_set_force(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap))); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set_force(td->td_proc, + RACCT_DATA, old - base); + racct_set_force(td->td_proc, + RACCT_VMEM, map->size); + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + racct_set_force(td->td_proc, + RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap))); + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif error = ENOMEM; goto done; } vm->vm_dsize += btoc(new - old); /* * Handle the MAP_WIREFUTURE case for legacy applications, * by marking the newly mapped range of pages as wired. * We are not required to perform a corresponding * vm_map_unwire() before vm_map_delete() below, as * it will forcibly unwire the pages in the range. * * XXX If the pages cannot be wired, no error is returned. 
*/ if ((map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) { if (bootverbose) printf("obreak: MAP_WIREFUTURE set\n"); do_map_wirefuture = TRUE; } } else if (new < old) { rv = vm_map_delete(map, new, old); if (rv != KERN_SUCCESS) { error = ENOMEM; goto done; } vm->vm_dsize -= btoc(old - new); #ifdef RACCT - PROC_LOCK(td->td_proc); - racct_set_force(td->td_proc, RACCT_DATA, new - base); - racct_set_force(td->td_proc, RACCT_VMEM, map->size); - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - racct_set_force(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap))); + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_set_force(td->td_proc, RACCT_DATA, new - base); + racct_set_force(td->td_proc, RACCT_VMEM, map->size); + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + racct_set_force(td->td_proc, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap))); + } + PROC_UNLOCK(td->td_proc); } - PROC_UNLOCK(td->td_proc); #endif } done: vm_map_unlock(map); if (do_map_wirefuture) (void) vm_map_wire(map, old, new, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ovadvise_args { int anom; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_ovadvise(td, uap) struct thread *td; struct ovadvise_args *uap; { /* START_GIANT_OPTIONAL */ /* END_GIANT_OPTIONAL */ return (EINVAL); } Index: stable/10/usr.bin/rctl/rctl.8 =================================================================== --- stable/10/usr.bin/rctl/rctl.8 (revision 284664) +++ stable/10/usr.bin/rctl/rctl.8 (revision 284665) @@ -1,276 +1,288 @@ .\"- .\" Copyright (c) 2009 Edward Tomasz Napierala .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd September 11, 2014 .Dt RCTL 8 .Os .Sh NAME .Nm rctl .Nd display and update resource limits database .Sh SYNOPSIS .Nm .Op Fl h .Op Fl n .Op Ar filter .Nm .Fl a .Op Ar rule .Nm .Fl l .Op Fl h .Op Fl n .Op Ar filter .Nm .Fl r .Op Ar filter .Nm .Fl u .Op Fl h .Op Ar filter .Pp .Nm requires the kernel to be compiled with: .Bd -ragged -offset indent .Cd "options RACCT" .Cd "options RCTL" .Ed .Sh DESCRIPTION When called without options, the .Nm command writes currently defined RCTL rules to standard output. 
.Pp If a .Ar filter argument is specified, only rules matching the filter are displayed. The options are as follows: .Bl -tag -width indent .It Fl a Ar rule Add .Ar rule to the RCTL database. .It Fl l Ar filter Display rules applicable to the process defined by .Ar filter . Note that this is different from showing the rules when called without any options, as it shows not just the rules with subject equal to that of process, but also rules for the user, jail, and login class applicable to the process. .It Fl r Ar filter Remove rules matching .Ar filter from the RCTL database. .It Fl u Ar filter Display resource usage for a subject .Po .Sy process , .Sy user , .Sy loginclass or .Sy jail .Pc matching the .Ar filter . .It Fl h "Human-readable" output. Use unit suffixes: Byte, Kilobyte, Megabyte, Gigabyte, Terabyte and Petabyte. .It Fl n Display user IDs numerically rather than converting them to a user name. .El .Pp Modifying rules affects all currently running and future processes matching the rule. .Sh RULE SYNTAX Syntax for a rule is subject:subject-id:resource:action=amount/per. .Pp .Bl -tag -width "subject-id" -compact -offset indent .It subject defines the kind of entity the rule applies to. It can be either .Sy process , .Sy user , .Sy loginclass , or .Sy jail . .It subject-id identifies the .Em subject . It can be a process ID, user name, numerical user ID, login class name from .Xr login.conf 5 , or jail name. .It resource identifies the resource the rule controls. See the .Sx RESOURCES section below for details. .It action defines what will happen when a process exceeds the allowed .Em amount . See the .Sx ACTIONS section below for details. .It amount defines how much of the resource a process can use before the defined .Em action triggers. Resources which limit bytes may use prefixes from .Xr expand_number 3 . .It per defines what entity the .Em amount gets accounted for. For example, rule "loginclass:users:vmem:deny=100M/process" means that each process of any user belonging to login class "users" may allocate up to 100MB of virtual memory. Rule "loginclass:users:vmem:deny=100M/user" would mean that for each user belonging to the login class "users", the sum of virtual memory allocated by all the processes of that user will not exceed 100MB. Rule "loginclass:users:vmem:deny=100M/loginclass" would mean that the sum of virtual memory allocated by all processes of all users belonging to that login class will not exceed 100MB. .El .Pp A valid rule has all those fields specified, except for .Em per , which defaults to the value of .Em subject . .Pp A filter is a rule for which one or more fields other than .Em per are left empty. For example, a filter that matches every rule could be written as ":::=/", or, in short, ":". A filter that matches all the login classes would be "loginclass:". A filter that matches all defined rules for .Sy maxproc resource would be "::maxproc".
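.Pp
As an illustration (the user name "joe" and the 200-process cap are arbitrary values chosen for the example, not taken from the rest of this page), a fully specified rule and a filter that matches it could look like:
.Dl user:joe:maxproc:deny=200/user
.Dl user:joe:maxproc
The first line denies user "joe" the creation of new processes once 200 of that user's processes exist; the second leaves the action and amount fields empty, so when used as a filter argument it matches that rule, as well as any other maxproc rule for the same user.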
.Sh SUBJECTS .Bl -column -offset 3n "pseudoterminals" ".Sy username or numerical User ID" .It Em subject Ta Em subject-id .It Sy process Ta numerical Process ID .It Sy user Ta user name or numerical User ID .It Sy loginclass Ta login class from .Xr login.conf 5 .It Sy jail Ta jail name .El .Sh RESOURCES .Bl -column -offset 3n "pseudoterminals" .It Em resource .It Sy cputime Ta "CPU time, in seconds" .It Sy datasize Ta "data size, in bytes" .It Sy stacksize Ta "stack size, in bytes" .It Sy coredumpsize Ta "core dump size, in bytes" .It Sy memoryuse Ta "resident set size, in bytes" .It Sy memorylocked Ta "locked memory, in bytes" .It Sy maxproc Ta "number of processes" .It Sy openfiles Ta "file descriptor table size" .It Sy vmemoryuse Ta "address space limit, in bytes" .It Sy pseudoterminals Ta "number of PTYs" .It Sy swapuse Ta "swap usage, in bytes" .It Sy nthr Ta "number of threads" .It Sy msgqqueued Ta "number of queued SysV messages" .It Sy msgqsize Ta "SysV message queue size, in bytes" .It Sy nmsgq Ta "number of SysV message queues" .It Sy nsem Ta "number of SysV semaphores" .It Sy nsemop Ta "number of SysV semaphores modified in a single semop(2) call" .It Sy nshm Ta "number of SysV shared memory segments" .It Sy shmsize Ta "SysV shared memory size, in bytes" .It Sy wallclock Ta "wallclock time, in seconds" .It Sy pcpu Ta "%CPU, in percents of a single CPU core" .El .Sh ACTIONS .Bl -column -offset 3n "pseudoterminals" .It Em action .It Sy deny Ta deny the allocation; not supported for .Sy cputime and .Sy wallclock .It Sy log Ta "log a warning to the console" .It Sy devctl Ta "send notification to" .Xr devd 8 using .Sy system = "RCTL", .Sy subsystem = "rule", .Sy type = "matched" .It sig* e.g. .Sy sigterm ; send a signal to the offending process. See .Xr signal 3 for a list of supported signals .El .Pp Not all actions are supported for all resources. Attempting to add a rule with an action not supported by a given resource will result in error. +.Sh LOADER TUNABLES +Tunables can be set at the +.Xr loader 8 +prompt, or +.Xr loader.conf 5 . +.Bl -tag -width indent +.It Va kern.racct.enable: No 1 +Enable +.Nm . +This defaults to 1, unless +.Cd "options RACCT_DEFAULT_TO_DISABLED" +is set in the kernel configuration file. .Sh EXIT STATUS .Ex -std .Sh EXAMPLES Prevent user "joe" from allocating more than 1GB of virtual memory: .Dl Nm Fl a Ar user:joe:vmemoryuse:deny=1g .Pp Remove all RCTL rules: .Dl Nm Fl r Ar \&: .Pp Display resource usage information for jail named "www": .Dl Nm Fl hu Ar jail:www .Pp Display all the rules applicable to process with PID 512: .Dl Nm Fl l Ar process:512 .Pp Display all rules: .Dl Nm .Pp Display all rules matching user "joe": .Dl Nm Ar user:joe .Pp Display all rules matching login classes: .Dl Nm Ar loginclass: .Sh SEE ALSO .Xr rctl.conf 5 .Sh HISTORY The .Nm command appeared in .Fx 9.0 . .Sh AUTHORS .An -nosplit The .Nm was developed by .An Edward Tomasz Napierala Aq trasz@FreeBSD.org under sponsorship from the FreeBSD Foundation. .Sh BUGS Limiting .Sy memoryuse may kill the machine due to thrashing. Index: stable/10 =================================================================== --- stable/10 (revision 284664) +++ stable/10 (revision 284665) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r282213,282901
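Editor's note: the LOADER TUNABLES section added to rctl.8 above describes how the new kern.racct.enable tunable interacts with the RACCT_DEFAULT_TO_DISABLED kernel option. A minimal loader.conf(5) sketch follows; it assumes a kernel built with "options RACCT_DEFAULT_TO_DISABLED" (so the tunable defaults to 0) and an administrator who wants resource accounting and rctl(8) rules active at boot. The file path and comments are illustrative, not part of this change.

    # /boot/loader.conf -- hypothetical example
    # Re-enable RACCT/RCTL resource accounting on a kernel that was
    # built with RACCT_DEFAULT_TO_DISABLED.
    kern.racct.enable="1"

Leaving the tunable unset (or setting it to 0) on such a kernel keeps resource accounting disabled, as described in the LOADER TUNABLES section.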