Index: stable/8/share/man/man9/Makefile =================================================================== --- stable/8/share/man/man9/Makefile (revision 225584) +++ stable/8/share/man/man9/Makefile (revision 225585) @@ -1,1384 +1,1385 @@ # $FreeBSD$ MAN= accept_filter.9 \ accf_data.9 \ accf_dns.9 \ accf_http.9 \ acl.9 \ alloc_unr.9 \ alq.9 \ altq.9 \ atomic.9 \ bios.9 \ boot.9 \ bpf.9 \ buf.9 \ BUF_ISLOCKED.9 \ BUF_LOCK.9 \ BUF_LOCKFREE.9 \ BUF_LOCKINIT.9 \ BUF_RECURSED.9 \ BUF_TIMELOCK.9 \ BUF_UNLOCK.9 \ bus_activate_resource.9 \ BUS_ADD_CHILD.9 \ bus_adjust_resource.9 \ bus_alloc_resource.9 \ BUS_BIND_INTR.9 \ bus_child_present.9 \ BUS_CONFIG_INTR.9 \ BUS_DESCRIBE_INTR.9 \ bus_dma.9 \ bus_generic_attach.9 \ bus_generic_detach.9 \ bus_generic_new_pass.9 \ bus_generic_print_child.9 \ bus_generic_read_ivar.9 \ bus_generic_shutdown.9 \ BUS_NEW_PASS.9 \ BUS_PRINT_CHILD.9 \ BUS_READ_IVAR.9 \ bus_release_resource.9 \ bus_set_pass.9 \ bus_set_resource.9 \ BUS_SETUP_INTR.9 \ bus_space.9 \ byteorder.9 \ cc.9 \ cd.9 \ condvar.9 \ config_intrhook.9 \ contigmalloc.9 \ copy.9 \ cr_cansee.9 \ critical_enter.9 \ cr_seeothergids.9 \ cr_seeotheruids.9 \ crypto.9 \ CTASSERT.9 \ DB_COMMAND.9 \ DECLARE_GEOM_CLASS.9 \ DECLARE_MODULE.9 \ DELAY.9 \ devclass.9 \ devclass_find.9 \ devclass_get_device.9 \ devclass_get_devices.9 \ devclass_get_drivers.9 \ devclass_get_maxunit.9 \ devclass_get_name.9 \ devclass_get_softc.9 \ dev_clone.9 \ devfs_set_cdevpriv.9 \ device.9 \ device_add_child.9 \ DEVICE_ATTACH.9 \ device_delete_child.9 \ DEVICE_DETACH.9 \ device_enable.9 \ device_find_child.9 \ device_get_children.9 \ device_get_devclass.9 \ device_get_driver.9 \ device_get_ivars.9 \ device_get_name.9 \ device_get_parent.9 \ device_get_softc.9 \ device_get_state.9 \ device_get_sysctl.9 \ device_get_unit.9 \ DEVICE_IDENTIFY.9 \ device_printf.9 \ DEVICE_PROBE.9 \ device_probe_and_attach.9 \ device_quiet.9 \ device_set_desc.9 \ device_set_driver.9 \ device_set_flags.9 \ DEVICE_SHUTDOWN.9 \ DEV_MODULE.9 \ devstat.9 \ devtoname.9 \ disk.9 \ domain.9 \ driver.9 \ DRIVER_MODULE.9 \ EVENTHANDLER.9 \ extattr.9 \ fail.9 \ fetch.9 \ firmware.9 \ g_access.9 \ g_attach.9 \ g_bio.9 \ g_consumer.9 \ g_data.9 \ get_cyclecount.9 \ getnewvnode.9 \ g_event.9 \ g_geom.9 \ g_provider.9 \ g_provider_by_name.9 \ groupmember.9 \ g_wither_geom.9 \ hash.9 \ hashinit.9 \ hexdump.9 \ hhook.9 \ ieee80211.9 \ ieee80211_amrr.9 \ ieee80211_beacon.9 \ ieee80211_bmiss.9 \ ieee80211_crypto.9 \ ieee80211_ddb.9 \ ieee80211_input.9 \ ieee80211_node.9 \ ieee80211_output.9 \ ieee80211_proto.9 \ ieee80211_radiotap.9 \ ieee80211_regdomain.9 \ ieee80211_scan.9 \ ieee80211_vap.9 \ ifnet.9 \ inittodr.9 \ insmntque.9 \ intro.9 \ ithread.9 \ KASSERT.9 \ kernacc.9 \ kernel_mount.9 \ khelp.9 \ kobj.9 \ kproc.9 \ kqueue.9 \ kthread.9 \ ktr.9 \ lock.9 \ locking.9 \ LOCK_PROFILING.9 \ mac.9 \ make_dev.9 \ malloc.9 \ mbchain.9 \ mbpool.9 \ mbuf.9 \ mbuf_tags.9 \ MD5.9 \ mdchain.9 \ memguard.9 \ microseq.9 \ microtime.9 \ microuptime.9 \ mi_switch.9 \ module.9 \ MODULE_DEPEND.9 \ MODULE_VERSION.9 \ mtx_pool.9 \ mutex.9 \ namei.9 \ netisr.9 \ osd.9 \ panic.9 \ pbuf.9 \ p_candebug.9 \ p_cansee.9 \ pci.9 \ pfil.9 \ pfind.9 \ pgfind.9 \ physio.9 \ pmap.9 \ pmap_activate.9 \ pmap_change_wiring.9 \ pmap_clear_modify.9 \ pmap_copy.9 \ pmap_enter.9 \ pmap_extract.9 \ pmap_growkernel.9 \ pmap_init.9 \ pmap_is_modified.9 \ pmap_is_prefaultable.9 \ pmap_map.9 \ pmap_mincore.9 \ pmap_object_init_pt.9 \ pmap_page_exists_quick.9 \ pmap_page_init.9 \ pmap_page_protect.9 \ pmap_pinit.9 \ pmap_qenter.9 \ pmap_release.9 \ pmap_remove.9 \ pmap_resident_count.9 \ pmap_zero_page.9 \ printf.9 \ prison_check.9 \ priv.9 \ pseudofs.9 \ psignal.9 \ random.9 \ random_harvest.9 \ redzone.9 \ refcount.9 \ resettodr.9 \ resource_int_value.9 \ rijndael.9 \ rman.9 \ rmlock.9 \ rtalloc.9 \ rtentry.9 \ runqueue.9 \ rwlock.9 \ sbuf.9 \ scheduler.9 \ securelevel_gt.9 \ selrecord.9 \ sema.9 \ sf_buf.9 \ sglist.9 \ signal.9 \ sleep.9 \ sleepqueue.9 \ socket.9 \ spl.9 \ stack.9 \ store.9 \ style.9 \ swi.9 \ sx.9 \ SYSCALL_MODULE.9 \ sysctl.9 \ sysctl_add_oid.9 \ sysctl_ctx_init.9 \ taskqueue.9 \ thread_exit.9 \ time.9 \ timeout.9 \ tvtohz.9 \ ucred.9 \ uidinfo.9 \ uio.9 \ usbdi.9 \ utopia.9 \ vaccess.9 \ vaccess_acl_nfs4.9 \ vaccess_acl_posix1e.9 \ vcount.9 \ vflush.9 \ VFS.9 \ vfs_busy.9 \ VFS_CHECKEXP.9 \ vfsconf.9 \ VFS_FHTOVP.9 \ vfs_getnewfsid.9 \ vfs_getopt.9 \ vfs_getvfs.9 \ VFS_LOCK_GIANT.9 \ VFS_MOUNT.9 \ vfs_mount.9 \ vfs_mountedfrom.9 \ VFS_QUOTACTL.9 \ VFS_ROOT.9 \ vfs_rootmountalloc.9 \ VFS_SET.9 \ VFS_STATFS.9 \ vfs_suser.9 \ VFS_SYNC.9 \ vfs_timestamp.9 \ vfs_unbusy.9 \ VFS_UNMOUNT.9 \ vfs_unmountall.9 \ VFS_VGET.9 \ vget.9 \ vgone.9 \ vhold.9 \ vinvalbuf.9 \ vm_fault_prefault.9 \ vm_map.9 \ vm_map_check_protection.9 \ vm_map_clean.9 \ vm_map_create.9 \ vm_map_delete.9 \ vm_map_entry_resize_free.9 \ vm_map_find.9 \ vm_map_findspace.9 \ vm_map_inherit.9 \ vm_map_init.9 \ vm_map_insert.9 \ vm_map_lock.9 \ vm_map_lookup.9 \ vm_map_madvise.9 \ vm_map_max.9 \ vm_map_protect.9 \ vm_map_remove.9 \ vm_map_simplify_entry.9 \ vm_map_stack.9 \ vm_map_submap.9 \ vm_map_wire.9 \ vm_page_alloc.9 \ vm_page_bits.9 \ vm_page_cache.9 \ vm_page_copy.9 \ vm_page_deactivate.9 \ vm_page_dontneed.9 \ vm_page_flag.9 \ vm_page_free.9 \ vm_page_grab.9 \ vm_page_hold.9 \ vm_page_insert.9 \ vm_page_io.9 \ vm_page_lookup.9 \ vm_page_protect.9 \ vm_page_rename.9 \ vm_page_sleep_if_busy.9 \ vm_page_wakeup.9 \ vm_page_wire.9 \ vm_page_zero_fill.9 \ vm_set_page_size.9 \ vn_fullpath.9 \ vn_isdisk.9 \ vnode.9 \ VOP_ACCESS.9 \ VOP_ACLCHECK.9 \ VOP_ADVLOCK.9 \ VOP_ATTRIB.9 \ VOP_BWRITE.9 \ VOP_CREATE.9 \ VOP_FSYNC.9 \ VOP_GETACL.9 \ VOP_GETEXTATTR.9 \ VOP_GETPAGES.9 \ VOP_GETVOBJECT.9 \ VOP_INACTIVE.9 \ VOP_IOCTL.9 \ VOP_LINK.9 \ VOP_LISTEXTATTR.9 \ VOP_LOCK.9 \ VOP_LOOKUP.9 \ VOP_OPENCLOSE.9 \ VOP_PATHCONF.9 \ VOP_PRINT.9 \ VOP_RDWR.9 \ VOP_READDIR.9 \ VOP_READLINK.9 \ VOP_REALLOCBLKS.9 \ VOP_REMOVE.9 \ VOP_RENAME.9 \ VOP_REVOKE.9 \ VOP_SETACL.9 \ VOP_SETEXTATTR.9 \ VOP_STRATEGY.9 \ VOP_VPTOCNP.9 \ VOP_VPTOFH.9 \ vref.9 \ vrefcnt.9 \ vrele.9 \ vslock.9 \ watchdog.9 \ zero_copy.9 \ zone.9 MLINKS= alloc_unr.9 alloc_unrl.9 \ alloc_unr.9 alloc_unr_specific.9 \ alloc_unr.9 delete_unrhdr.9 \ alloc_unr.9 free_unr.9 \ alloc_unr.9 new_unrhdr.9 MLINKS+=alq.9 ALQ.9 \ alq.9 alq_close.9 \ alq.9 alq_flush.9 \ alq.9 alq_get.9 \ alq.9 alq_open.9 \ alq.9 alq_post.9 \ alq.9 alq_write.9 MLINKS+=altq.9 ALTQ.9 MLINKS+=atomic.9 atomic_add.9 \ atomic.9 atomic_clear.9 \ atomic.9 atomic_cmpset.9 \ atomic.9 atomic_fetchadd.9 \ atomic.9 atomic_load.9 \ atomic.9 atomic_readandclear.9 \ atomic.9 atomic_set.9 \ atomic.9 atomic_store.9 \ atomic.9 atomic_subtract.9 MLINKS+=bpf.9 bpf_filter.9 \ bpf.9 bpf_mtap.9 \ bpf.9 bpf_mtap2.9 \ bpf.9 bpf_tap.9 \ bpf.9 bpf_validate.9 \ bpf.9 bpfattach.9 \ bpf.9 bpfattach2.9 \ bpf.9 bpfdetach.9 MLINKS+=buf.9 bp.9 MLINKS+=bus_activate_resource.9 bus_deactivate_resource.9 MLINKS+=bus_alloc_resource.9 bus_alloc_resource_any.9 MLINKS+=BUS_BIND_INTR.9 bus_bind_intr.9 MLINKS+=BUS_DESCRIBE_INTR.9 bus_describe_intr.9 MLINKS+=bus_dma.9 busdma.9 \ bus_dma.9 bus_dmamap_create.9 \ bus_dma.9 bus_dmamap_destroy.9 \ bus_dma.9 bus_dmamap_load.9 \ bus_dma.9 bus_dmamap_load_mbuf.9 \ bus_dma.9 bus_dmamap_load_mbuf_sg.9 \ bus_dma.9 bus_dmamap_load_uio.9 \ bus_dma.9 bus_dmamap_sync.9 \ bus_dma.9 bus_dmamap_unload.9 \ bus_dma.9 bus_dmamem_alloc.9 \ bus_dma.9 bus_dmamem_free.9 \ bus_dma.9 bus_dma_tag_create.9 \ bus_dma.9 bus_dma_tag_destroy.9 MLINKS+=bus_generic_read_ivar.9 bus_generic_write_ivar.9 MLINKS+=BUS_READ_IVAR.9 BUS_WRITE_IVAR.9 MLINKS+=BUS_SETUP_INTR.9 bus_setup_intr.9 \ BUS_SETUP_INTR.9 BUS_TEARDOWN_INTR.9 \ BUS_SETUP_INTR.9 bus_teardown_intr.9 MLINKS+=bus_space.9 bus_space_barrier.9 \ bus_space.9 bus_space_copy_region_1.9 \ bus_space.9 bus_space_copy_region_2.9 \ bus_space.9 bus_space_copy_region_4.9 \ bus_space.9 bus_space_copy_region_8.9 \ bus_space.9 bus_space_copy_region_stream_1.9 \ bus_space.9 bus_space_copy_region_stream_2.9 \ bus_space.9 bus_space_copy_region_stream_4.9 \ bus_space.9 bus_space_copy_region_stream_8.9 \ bus_space.9 bus_space_free.9 \ bus_space.9 bus_space_map.9 \ bus_space.9 bus_space_read_1.9 \ bus_space.9 bus_space_read_2.9 \ bus_space.9 bus_space_read_4.9 \ bus_space.9 bus_space_read_8.9 \ bus_space.9 bus_space_read_multi_1.9 \ bus_space.9 bus_space_read_multi_2.9 \ bus_space.9 bus_space_read_multi_4.9 \ bus_space.9 bus_space_read_multi_8.9 \ bus_space.9 bus_space_read_multi_stream_1.9 \ bus_space.9 bus_space_read_multi_stream_2.9 \ bus_space.9 bus_space_read_multi_stream_4.9 \ bus_space.9 bus_space_read_multi_stream_8.9 \ bus_space.9 bus_space_read_region_1.9 \ bus_space.9 bus_space_read_region_2.9 \ bus_space.9 bus_space_read_region_4.9 \ bus_space.9 bus_space_read_region_8.9 \ bus_space.9 bus_space_read_region_stream_1.9 \ bus_space.9 bus_space_read_region_stream_2.9 \ bus_space.9 bus_space_read_region_stream_4.9 \ bus_space.9 bus_space_read_region_stream_8.9 \ bus_space.9 bus_space_read_stream_1.9 \ bus_space.9 bus_space_read_stream_2.9 \ bus_space.9 bus_space_read_stream_4.9 \ bus_space.9 bus_space_read_stream_8.9 \ bus_space.9 bus_space_set_multi_1.9 \ bus_space.9 bus_space_set_multi_2.9 \ bus_space.9 bus_space_set_multi_4.9 \ bus_space.9 bus_space_set_multi_8.9 \ bus_space.9 bus_space_set_multi_stream_1.9 \ bus_space.9 bus_space_set_multi_stream_2.9 \ bus_space.9 bus_space_set_multi_stream_4.9 \ bus_space.9 bus_space_set_multi_stream_8.9 \ bus_space.9 bus_space_set_region_1.9 \ bus_space.9 bus_space_set_region_2.9 \ bus_space.9 bus_space_set_region_4.9 \ bus_space.9 bus_space_set_region_8.9 \ bus_space.9 bus_space_set_region_stream_1.9 \ bus_space.9 bus_space_set_region_stream_2.9 \ bus_space.9 bus_space_set_region_stream_4.9 \ bus_space.9 bus_space_set_region_stream_8.9 \ bus_space.9 bus_space_subregion.9 \ bus_space.9 bus_space_unmap.9 \ bus_space.9 bus_space_write_1.9 \ bus_space.9 bus_space_write_2.9 \ bus_space.9 bus_space_write_4.9 \ bus_space.9 bus_space_write_8.9 \ bus_space.9 bus_space_write_multi_1.9 \ bus_space.9 bus_space_write_multi_2.9 \ bus_space.9 bus_space_write_multi_4.9 \ bus_space.9 bus_space_write_multi_8.9 \ bus_space.9 bus_space_write_multi_stream_1.9 \ bus_space.9 bus_space_write_multi_stream_2.9 \ bus_space.9 bus_space_write_multi_stream_4.9 \ bus_space.9 bus_space_write_multi_stream_8.9 \ bus_space.9 bus_space_write_region_1.9 \ bus_space.9 bus_space_write_region_2.9 \ bus_space.9 bus_space_write_region_4.9 \ bus_space.9 bus_space_write_region_8.9 \ bus_space.9 bus_space_write_region_stream_1.9 \ bus_space.9 bus_space_write_region_stream_2.9 \ bus_space.9 bus_space_write_region_stream_4.9 \ bus_space.9 bus_space_write_region_stream_8.9 \ bus_space.9 bus_space_write_stream_1.9 \ bus_space.9 bus_space_write_stream_2.9 \ bus_space.9 bus_space_write_stream_4.9 \ bus_space.9 bus_space_write_stream_8.9 MLINKS+=byteorder.9 be16dec.9 \ byteorder.9 be16enc.9 \ byteorder.9 be16toh.9 \ byteorder.9 be32dec.9 \ byteorder.9 be32enc.9 \ byteorder.9 be32toh.9 \ byteorder.9 be64dec.9 \ byteorder.9 be64enc.9 \ byteorder.9 be64toh.9 \ byteorder.9 bswap16.9 \ byteorder.9 bswap32.9 \ byteorder.9 bswap64.9 \ byteorder.9 htobe16.9 \ byteorder.9 htobe32.9 \ byteorder.9 htobe64.9 \ byteorder.9 htole16.9 \ byteorder.9 htole32.9 \ byteorder.9 htole64.9 \ byteorder.9 le16dec.9 \ byteorder.9 le16enc.9 \ byteorder.9 le16toh.9 \ byteorder.9 le32dec.9 \ byteorder.9 le32enc.9 \ byteorder.9 le32toh.9 \ byteorder.9 le64dec.9 \ byteorder.9 le64enc.9 \ byteorder.9 le64toh.9 MLINKS+=condvar.9 cv_broadcast.9 \ condvar.9 cv_broadcastpri.9 \ condvar.9 cv_destroy.9 \ condvar.9 cv_init.9 \ condvar.9 cv_signal.9 \ condvar.9 cv_timedwait.9 \ condvar.9 cv_timedwait_sig.9 \ condvar.9 cv_wait.9 \ condvar.9 cv_wait_sig.9 \ condvar.9 cv_wait_unlock.9 \ condvar.9 cv_wmesg.9 MLINKS+=config_intrhook.9 config_intrhook_disestablish.9 \ config_intrhook.9 config_intrhook_establish.9 MLINKS+=contigmalloc.9 contigfree.9 MLINKS+=copy.9 copyin.9 \ copy.9 copyinstr.9 \ copy.9 copyout.9 \ copy.9 copystr.9 MLINKS+=critical_enter.9 critical.9 \ critical_enter.9 critical_exit.9 MLINKS+=crypto.9 crypto_dispatch.9 \ crypto.9 crypto_done.9 \ crypto.9 crypto_freereq.9 \ crypto.9 crypto_freesession.9 \ crypto.9 crypto_get_driverid.9 \ crypto.9 crypto_getreq.9 \ crypto.9 crypto_kdispatch.9 \ crypto.9 crypto_kdone.9 \ crypto.9 crypto_kregister.9 \ crypto.9 crypto_newsession.9 \ crypto.9 crypto_register.9 \ crypto.9 crypto_unblock.9 \ crypto.9 crypto_unregister.9 \ crypto.9 crypto_unregister_all.9 MLINKS+=DB_COMMAND.9 DB_SHOW_COMMAND.9 \ DB_COMMAND.9 DB_SHOW_ALL_COMMAND.9 MLINKS+=dev_clone.9 drain_dev_clone_events.9 MLINKS+=devfs_set_cdevpriv.9 devfs_get_cdevpriv.9 \ devfs_set_cdevpriv.9 devfs_clear_cdevpriv.9 MLINKS+=device_add_child.9 device_add_child_ordered.9 MLINKS+=device_enable.9 device_disable.9 \ device_enable.9 device_is_enabled.9 MLINKS+=device_get_ivars.9 device_set_ivars.9 MLINKS+=device_get_name.9 device_get_nameunit.9 MLINKS+=device_get_state.9 device_busy.9 \ device_get_state.9 device_is_alive.9 \ device_get_state.9 device_is_attached.9 \ device_get_state.9 device_unbusy.9 MLINKS+=device_get_sysctl.9 device_get_sysctl_ctx.9 \ device_get_sysctl.9 device_get_sysctl_tree.9 MLINKS+=device_quiet.9 device_is_quiet.9 \ device_quiet.9 device_verbose.9 MLINKS+=device_set_desc.9 device_get_desc.9 \ device_set_desc.9 device_set_desc_copy.9 MLINKS+=device_set_flags.9 device_get_flags.9 MLINKS+=devstat.9 devicestat.9 \ devstat.9 devstat_add_entry.9 \ devstat.9 devstat_end_transaction.9 \ devstat.9 devstat_remove_entry.9 \ devstat.9 devstat_start_transaction.9 MLINKS+=disk.9 disk_alloc.9 \ disk.9 disk_create.9 \ disk.9 disk_gone.9 \ disk.9 disk_destroy.9 MLINKS+=domain.9 DOMAIN_SET.9 \ domain.9 net_add_domain.9 \ domain.9 pfctlinput.9 \ domain.9 pfctlinput2.9 \ domain.9 pffindproto.9 \ domain.9 pffindtype.9 MLINKS+=DRIVER_MODULE.9 MULTI_DRIVER_MODULE.9 MLINKS+=EVENTHANDLER.9 EVENTHANDLER_DECLARE.9 \ EVENTHANDLER.9 EVENTHANDLER_DEREGISTER.9 \ EVENTHANDLER.9 eventhandler_deregister.9 \ EVENTHANDLER.9 eventhandler_find_list.9 \ EVENTHANDLER.9 EVENTHANDLER_INVOKE.9 \ EVENTHANDLER.9 eventhandler_prune_list.9 \ EVENTHANDLER.9 EVENTHANDLER_REGISTER.9 \ EVENTHANDLER.9 eventhandler_register.9 MLINKS+=fetch.9 fubyte.9 \ fetch.9 fuswintr.9 \ fetch.9 fuword.9 \ fetch.9 fuword16.9 \ fetch.9 fuword32.9 \ fetch.9 fuword64.9 MLINKS+=g_attach.9 g_detach.9 MLINKS+=g_bio.9 g_clone_bio.9 \ g_bio.9 g_destroy_bio.9 \ g_bio.9 g_new_bio.9 \ g_bio.9 g_print_bio.9 MLINKS+=g_consumer.9 g_destroy_consumer.9 \ g_consumer.9 g_new_consumer.9 MLINKS+=g_data.9 g_read_data.9 \ g_data.9 g_write_data.9 MLINKS+=g_event.9 g_cancel_event.9 \ g_event.9 g_post_event.9 \ g_event.9 g_waitfor_event.9 MLINKS+=g_geom.9 g_destroy_geom.9 \ g_geom.9 g_new_geomf.9 MLINKS+=g_provider.9 g_destroy_provider.9 \ g_provider.9 g_error_provider.9 \ g_provider.9 g_new_providerf.9 MLINKS+=hash.9 hash32.9 \ hash.9 hash32_buf.9 \ hash.9 hash32_str.9 \ hash.9 hash32_stre.9 \ hash.9 hash32_strn.9 \ hash.9 hash32_strne.9 MLINKS+=hashinit.9 hashdestroy.9 \ hashinit.9 hashinit_flags.9 \ hashinit.9 phashinit.9 MLINKS+=ieee80211.9 ieee80211_ifattach.9 \ ieee80211.9 ieee80211_ifdetach.9 MLINKS+=ieee80211_amrr.9 ieee80211_amrr_init.9 \ ieee80211_amrr.9 ieee80211_amrr_cleanup.9 \ ieee80211_amrr.9 ieee80211_amrr_setinterval.9 \ ieee80211_amrr.9 ieee80211_amrr_node_init.9 \ ieee80211_amrr.9 ieee80211_amrr_tx_complete.9 \ ieee80211_amrr.9 ieee80211_amrr_tx_update.9 MLINKS+=ieee80211_beacon.9 ieee80211_beacon_alloc.9 \ ieee80211_beacon.9 ieee80211_beacon_update.9 \ ieee80211_beacon.9 ieee80211_beacon_notify.9 MLINKS+=ieee80211_bmiss.9 ieee80211_beacon_miss.9 MLINKS+=ieee80211_crypto.9 ieee80211_key_update_begin.9 \ ieee80211_crypto.9 ieee80211_key_update_end.9 \ ieee80211_crypto.9 ieee80211_crypto_newkey.9 \ ieee80211_crypto.9 ieee80211_crypto_setkey.9 \ ieee80211_crypto.9 ieee80211_crypto_delglobalkeys.9 \ ieee80211_crypto.9 ieee80211_crypto_reload_keys.9 \ ieee80211_crypto.9 ieee80211_crypto_decap.9 \ ieee80211_crypto.9 ieee80211_crypto_encap.9 \ ieee80211_crypto.9 ieee80211_crypto_demic.9 \ ieee80211_crypto.9 ieee80211_crypto_enmic.9 \ ieee80211_crypto.9 ieee80211_notify_michael_failure.9 \ ieee80211_crypto.9 ieee80211_notify_replay_failure.9 \ ieee80211_crypto.9 ieee80211_crypto_register.9 \ ieee80211_crypto.9 ieee80211_crypto_unregister.9 \ ieee80211_crypto.9 ieee80211_crypto_available.9 MLINKS+=ieee80211_input.9 ieee80211_input_all.9 MLINKS+=ieee80211_node.9 ieee80211_find_rxnode.9 \ ieee80211_node.9 ieee80211_find_rxnode_withkey.9 \ ieee80211_node.9 ieee80211_ref_node.9 \ ieee80211_node.9 ieee80211_unref_node.9 \ ieee80211_node.9 ieee80211_free_node.9 \ ieee80211_node.9 ieee80211_iterate_nodes.9 \ ieee80211_node.9 ieee80211_dump_node.9 \ ieee80211_node.9 ieee80211_dump_nodes.9 MLINKS+=ieee80211_output.9 M_WME_GETAC.9 \ ieee80211_output.9 M_SEQNO_GET.9 \ ieee80211_output.9 ieee80211_process_callback.9 MLINKS+=ieee80211_proto.9 ieee80211_new_state.9 \ ieee80211_proto.9 ieee80211_start_all.9 \ ieee80211_proto.9 ieee80211_stop_all.9 \ ieee80211_proto.9 ieee80211_suspend_all.9 \ ieee80211_proto.9 ieee80211_resume_all.9 \ ieee80211_proto.9 ieee80211_waitfor_parent.9 MLINKS+=ieee80211_radiotap.9 radiotap.9 \ ieee80211_radiotap.9 ieee80211_radiotap_attach.9 \ ieee80211_radiotap.9 ieee80211_radiotap_active_vap.9 \ ieee80211_radiotap.9 ieee80211_radiotap_active.9 \ ieee80211_radiotap.9 ieee80211_radiotap_tx.9 MLINKS+=ieee80211_regdomain.9 ieee80211_init_channels.9 \ ieee80211_regdomain.9 ieee80211_sort_channels.9 \ ieee80211_regdomain.9 ieee80211_alloc_countryie.9 MLINKS+=ieee80211_vap.9 ieee80211_vap_setup.9 \ ieee80211_vap.9 ieee80211_vap_attach.9 \ ieee80211_vap.9 ieee80211_vap_detach.9 MLINKS+=ifnet.9 ifaddr.9 \ ifnet.9 if_data.9 \ ifnet.9 ifqueue.9 MLINKS+=insmntque.9 insmntque1.9 MLINKS+=ithread.9 ithread_add_handler.9 \ ithread.9 ithread_create.9 \ ithread.9 ithread_destroy.9 \ ithread.9 ithread_priority.9 \ ithread.9 ithread_remove_handler.9 \ ithread.9 ithread_schedule.9 MLINKS+=kernacc.9 useracc.9 MLINKS+=kernel_mount.9 free_mntarg.9 \ kernel_mount.9 kernel_vmount.9 \ kernel_mount.9 mount_arg.9 \ kernel_mount.9 mount_argb.9 \ kernel_mount.9 mount_argf.9 \ kernel_mount.9 mount_argsu.9 MLINKS+=kobj.9 DEFINE_CLASS.9 \ kobj.9 kobj_class_compile.9 \ kobj.9 kobj_class_compile_static.9 \ kobj.9 kobj_class_free.9 \ kobj.9 kobj_create.9 \ kobj.9 kobj_delete.9 \ kobj.9 kobj_init.9 MLINKS+=kproc.9 kproc_create.9 \ kproc.9 kthread_create.9 \ kproc.9 kproc_exit.9 \ kproc.9 kproc_resume.9 \ kproc.9 kproc_shutdown.9 \ kproc.9 kproc_start.9 \ kproc.9 kproc_suspend.9 \ kproc.9 kproc_suspend_check.9 MLINKS+=kqueue.9 knlist_add.9 \ kqueue.9 knlist_clear.9 \ kqueue.9 knlist_delete.9 \ kqueue.9 knlist_destroy.9 \ kqueue.9 knlist_empty.9 \ kqueue.9 knlist_init.9 \ kqueue.9 knlist_remove.9 \ kqueue.9 knlist_remove_inevent.9 \ kqueue.9 knote_fdclose.9 \ kqueue.9 KNOTE_LOCKED.9 \ kqueue.9 KNOTE_UNLOCKED.9 \ kqueue.9 kqfd_register.9 \ kqueue.9 kqueue_add_filteropts.9 \ kqueue.9 kqueue_del_filteropts.9 MLINKS+=kthread.9 kthread_add.9 \ kthread.9 kthread_exit.9 \ kthread.9 kthread_resume.9 \ kthread.9 kthread_shutdown.9 \ kthread.9 kthread_start.9 \ kthread.9 kthread_suspend.9 \ kthread.9 kthread_suspend_check.9 MLINKS+=ktr.9 CTR0.9 \ ktr.9 CTR1.9 \ ktr.9 CTR2.9 \ ktr.9 CTR3.9 \ ktr.9 CTR4.9 \ ktr.9 CTR5.9 MLINKS+=lock.9 lockdestroy.9 \ lock.9 lockinit.9 \ lock.9 lockmgr.9 \ lock.9 lockmgr_args.9 \ lock.9 lockmgr_args_rw.9 \ lock.9 lockmgr_assert.9 \ lock.9 lockmgr_disown.9 \ lock.9 lockmgr_printinfo.9 \ lock.9 lockmgr_recursed.9 \ lock.9 lockmgr_rw.9 \ lock.9 lockmgr_waiters.9 \ lock.9 lockstatus.9 MLINKS+=LOCK_PROFILING.9 MUTEX_PROFILING.9 MLINKS+=make_dev.9 destroy_dev.9 \ make_dev.9 dev_depends.9 \ make_dev.9 make_dev_alias.9 MLINKS+=malloc.9 free.9 \ malloc.9 MALLOC_DECLARE.9 \ malloc.9 MALLOC_DEFINE.9 \ malloc.9 realloc.9 \ malloc.9 reallocf.9 MLINKS+=mbchain.9 mb_detach.9 \ mbchain.9 mb_done.9 \ mbchain.9 mb_fixhdr.9 \ mbchain.9 mb_init.9 \ mbchain.9 mb_initm.9 \ mbchain.9 mb_put_int64be.9 \ mbchain.9 mb_put_int64le.9 \ mbchain.9 mb_put_mbuf.9 \ mbchain.9 mb_put_mem.9 \ mbchain.9 mb_put_uint16be.9 \ mbchain.9 mb_put_uint16le.9 \ mbchain.9 mb_put_uint32be.9 \ mbchain.9 mb_put_uint32le.9 \ mbchain.9 mb_put_uint8.9 \ mbchain.9 mb_put_uio.9 \ mbchain.9 mb_reserve.9 MLINKS+=mbpool.9 mbp_alloc.9 \ mbpool.9 mbp_card_free.9 \ mbpool.9 mbp_count.9 \ mbpool.9 mbp_create.9 \ mbpool.9 mbp_destroy.9 \ mbpool.9 mbp_ext_free.9 \ mbpool.9 mbp_free.9 \ mbpool.9 mbp_get.9 \ mbpool.9 mbp_get_keep.9 \ mbpool.9 mbp_sync.9 MLINKS+=\ mbuf.9 m_adj.9 \ mbuf.9 M_ALIGN.9 \ mbuf.9 m_apply.9 \ mbuf.9 m_cat.9 \ mbuf.9 MCHTYPE.9 \ mbuf.9 MCLGET.9 \ mbuf.9 m_copyback.9 \ mbuf.9 m_copydata.9 \ mbuf.9 m_copym.9 \ mbuf.9 m_copypacket.9 \ mbuf.9 m_defrag.9 \ mbuf.9 m_devget.9 \ mbuf.9 m_dup.9 \ mbuf.9 m_dup_pkthdr.9 \ mbuf.9 MEXTADD.9 \ mbuf.9 MEXT_ADD_REF.9 \ mbuf.9 MEXTFREE.9 \ mbuf.9 MEXT_IS_REF.9 \ mbuf.9 MEXT_REM_REF.9 \ mbuf.9 m_fixhdr.9 \ mbuf.9 MFREE.9 \ mbuf.9 m_free.9 \ mbuf.9 m_freem.9 \ mbuf.9 MGET.9 \ mbuf.9 m_get.9 \ mbuf.9 m_getcl.9 \ mbuf.9 m_getclr.9 \ mbuf.9 MGETHDR.9 \ mbuf.9 m_gethdr.9 \ mbuf.9 m_getm.9 \ mbuf.9 m_getptr.9 \ mbuf.9 MH_ALIGN.9 \ mbuf.9 M_LEADINGSPACE.9 \ mbuf.9 m_length.9 \ mbuf.9 M_MOVE_PKTHDR.9 \ mbuf.9 m_move_pkthdr.9 \ mbuf.9 M_PREPEND.9 \ mbuf.9 m_prepend.9 \ mbuf.9 m_pullup.9 \ mbuf.9 m_split.9 \ mbuf.9 mtod.9 \ mbuf.9 M_TRAILINGSPACE.9 \ mbuf.9 M_WRITABLE.9 MLINKS+=MD5.9 MD5Init.9 \ MD5.9 MD5Transform.9 MLINKS+=mdchain.9 md_append_record.9 \ mdchain.9 md_done.9 \ mdchain.9 md_get_int64.9 \ mdchain.9 md_get_int64be.9 \ mdchain.9 md_get_int64le.9 \ mdchain.9 md_get_mbuf.9 \ mdchain.9 md_get_mem.9 \ mdchain.9 md_get_uint16.9 \ mdchain.9 md_get_uint16be.9 \ mdchain.9 md_get_uint16le.9 \ mdchain.9 md_get_uint32.9 \ mdchain.9 md_get_uint32be.9 \ mdchain.9 md_get_uint32le.9 \ mdchain.9 md_get_uint8.9 \ mdchain.9 md_get_uio.9 \ mdchain.9 md_initm.9 \ mdchain.9 md_next_record.9 MLINKS+=microtime.9 bintime.9 \ microtime.9 getbintime.9 \ microtime.9 getmicrotime.9 \ microtime.9 getnanotime.9 \ microtime.9 nanotime.9 MLINKS+=microuptime.9 binuptime.9 \ microuptime.9 getbinuptime.9 \ microuptime.9 getmicrouptime.9 \ microuptime.9 getnanouptime.9 \ microuptime.9 nanouptime.9 MLINKS+=mi_switch.9 cpu_switch.9 \ mi_switch.9 cpu_throw.9 MLINKS+=mtx_pool.9 mtx_pool_alloc.9 \ mtx_pool.9 mtx_pool_create.9 \ mtx_pool.9 mtx_pool_destroy.9 \ mtx_pool.9 mtx_pool_find.9 \ mtx_pool.9 mtx_pool_lock.9 \ mtx_pool.9 mtx_pool_lock_spin.9 \ mtx_pool.9 mtx_pool_unlock.9 \ mtx_pool.9 mtx_pool_unlock_spin.9 MLINKS+=mutex.9 mtx_assert.9 \ mutex.9 mtx_destroy.9 \ mutex.9 mtx_init.9 \ mutex.9 mtx_initialized.9 \ mutex.9 mtx_lock.9 \ mutex.9 mtx_lock_flags.9 \ mutex.9 mtx_lock_spin.9 \ mutex.9 mtx_lock_spin_flags.9 \ mutex.9 mtx_owned.9 \ mutex.9 mtx_recursed.9 \ mutex.9 mtx_sleep.9 \ mutex.9 MTX_SYSINIT.9 \ mutex.9 mtx_trylock.9 \ mutex.9 mtx_trylock_flags.9 \ mutex.9 mtx_unlock.9 \ mutex.9 mtx_unlock_flags.9 \ mutex.9 mtx_unlock_spin.9 \ mutex.9 mtx_unlock_spin_flags.9 MLINKS+=namei.9 NDFREE.9 \ namei.9 NDHASGIANT.9 \ namei.9 NDINIT.9 MLINKS+=pbuf.9 getpbuf.9 \ pbuf.9 relpbuf.9 \ pbuf.9 trypbuf.9 MLINKS+=pci.9 pci_disable_busmaster.9 \ pci.9 pci_disable_io.9 \ pci.9 pci_enable_busmaster.9 \ pci.9 pci_enable_io.9 \ pci.9 pci_find_bsf.9 \ pci.9 pci_find_dbsf.9 \ pci.9 pci_find_device.9 \ pci.9 pci_get_powerstate.9 \ pci.9 pci_read_config.9 \ pci.9 pci_set_powerstate.9 \ pci.9 pci_write_config.9 MLINKS+=pfil.9 pfil_add_hook.9 \ pfil.9 pfil_hook_get.9 \ pfil.9 pfil_remove_hook.9 MLINKS+=pfind.9 zpfind.9 MLINKS+=pmap_clear_modify.9 pmap_clear_reference.9 MLINKS+=pmap_copy.9 pmap_copy_page.9 MLINKS+=pmap_extract.9 pmap_extract_and_hold.9 MLINKS+=pmap_init.9 pmap_init2.9 MLINKS+=pmap_is_modified.9 pmap_ts_modified.9 MLINKS+=pmap_page_protect.9 pmap_protect.9 MLINKS+=pmap_pinit.9 pmap_pinit0.9 \ pmap_pinit.9 pmap_pinit2.9 MLINKS+=pmap_qenter.9 pmap_qremove.9 MLINKS+=pmap_remove.9 pmap_remove_all.9 \ pmap_remove.9 pmap_remove_pages.9 MLINKS+=pmap_resident_count.9 pmap_wired_count.9 MLINKS+=pmap_zero_page.9 pmap_zero_area.9 \ pmap_zero_page.9 pmap_zero_idle.9 MLINKS+=printf.9 log.9 \ printf.9 tprintf.9 \ printf.9 uprintf.9 MLINKS+=priv.9 priv_check.9 \ priv.9 priv_check_cred.9 MLINKS+=psignal.9 gsignal.9 \ psignal.9 pgsignal.9 MLINKS+=random.9 arc4rand.9 \ random.9 arc4random.9 \ random.9 read_random.9 \ random.9 srandom.9 MLINKS+=refcount.9 refcount_acquire.9 \ refcount.9 refcount_init.9 \ refcount.9 refcount_release.9 MLINKS+=resource_int_value.9 resource_long_value.9 \ resource_int_value.9 resource_string_value.9 MLINKS+=rman.9 rman_activate_resource.9 \ rman.9 rman_adjust_resource.9 \ rman.9 rman_await_resource.9 \ rman.9 rman_deactivate_resource.9 \ rman.9 rman_fini.9 \ rman.9 rman_first_free_region.9 \ rman.9 rman_get_bushandle.9 \ rman.9 rman_get_bustag.9 \ rman.9 rman_get_device.9 \ rman.9 rman_get_end.9 \ rman.9 rman_get_flags.9 \ rman.9 rman_get_rid.9 \ rman.9 rman_get_size.9 \ rman.9 rman_get_start.9 \ rman.9 rman_get_virtual.9 \ rman.9 rman_init.9 \ rman.9 rman_init_from_resource.9 \ rman.9 rman_is_region_manager.9 \ rman.9 rman_last_free_region.9 \ rman.9 rman_make_alignment_flags.9 \ rman.9 rman_manage_region.9 \ rman.9 rman_release_resource.9 \ rman.9 rman_reserve_resource.9 \ rman.9 rman_reserve_resource_bound.9 \ rman.9 rman_set_bushandle.9 \ rman.9 rman_set_bustag.9 \ rman.9 rman_set_rid.9 \ rman.9 rman_set_virtual.9 MLINKS+=rmlock.9 rm_destroy.9 \ rmlock.9 rm_init.9 \ rmlock.9 rm_rlock.9 \ rmlock.9 rm_runlock.9 \ rmlock.9 RM_SYSINIT.9 \ rmlock.9 rm_wlock.9 \ rmlock.9 rm_wowned.9 \ rmlock.9 rm_wunlock.9 MLINKS+=rtalloc.9 rtalloc1.9 \ rtalloc.9 rtalloc_ign.9 \ rtalloc.9 RTFREE.9 \ rtalloc.9 rtfree.9 MLINKS+=runqueue.9 chooseproc.9 \ runqueue.9 procrunnable.9 \ runqueue.9 remrunqueue.9 \ runqueue.9 setrunqueue.9 MLINKS+=rwlock.9 rw_assert.9 \ rwlock.9 rw_destroy.9 \ rwlock.9 rw_downgrade.9 \ rwlock.9 rw_init.9 \ rwlock.9 rw_initialized.9 \ rwlock.9 rw_rlock.9 \ rwlock.9 rw_runlock.9 \ rwlock.9 rw_sleep.9 \ rwlock.9 RW_SYSINIT.9 \ rwlock.9 rw_try_rlock.9 \ rwlock.9 rw_try_upgrade.9 \ rwlock.9 rw_try_wlock.9 \ rwlock.9 rw_wlock.9 \ rwlock.9 rw_wowned.9 \ rwlock.9 rw_wunlock.9 MLINKS+=sbuf.9 sbuf_bcat.9 \ sbuf.9 sbuf_bcopyin.9 \ sbuf.9 sbuf_bcpy.9 \ sbuf.9 sbuf_cat.9 \ sbuf.9 sbuf_clear.9 \ sbuf.9 sbuf_copyin.9 \ sbuf.9 sbuf_cpy.9 \ sbuf.9 sbuf_data.9 \ sbuf.9 sbuf_delete.9 \ sbuf.9 sbuf_done.9 \ sbuf.9 sbuf_finish.9 \ sbuf.9 sbuf_len.9 \ sbuf.9 sbuf_new.9 \ sbuf.9 sbuf_new_for_sysctl.9 \ sbuf.9 sbuf_overflowed.9 \ sbuf.9 sbuf_printf.9 \ sbuf.9 sbuf_putc.9 \ sbuf.9 sbuf_set_drain.9 \ sbuf.9 sbuf_setpos.9 \ sbuf.9 sbuf_trim.9 \ sbuf.9 sbuf_vprintf.9 MLINKS+=scheduler.9 curpriority_cmp.9 \ scheduler.9 maybe_resched.9 \ scheduler.9 resetpriority.9 \ scheduler.9 roundrobin.9 \ scheduler.9 roundrobin_interval.9 \ scheduler.9 schedclock.9 \ scheduler.9 schedcpu.9 \ scheduler.9 sched_setup.9 \ scheduler.9 setrunnable.9 \ scheduler.9 updatepri.9 MLINKS+=securelevel_gt.9 securelevel_ge.9 -MLINKS+=selrecord.9 selwakeup.9 +MLINKS+=selrecord.9 seldrain.9 \ + selrecord.9 selwakeup.9 MLINKS+=sema.9 sema_destroy.9 \ sema.9 sema_init.9 \ sema.9 sema_post.9 \ sema.9 sema_timedwait.9 \ sema.9 sema_trywait.9 \ sema.9 sema_value.9 \ sema.9 sema_wait.9 MLINKS+=sf_buf.9 sf_buf_alloc.9 \ sf_buf.9 sf_buf_free.9 \ sf_buf.9 sf_buf_kva.9 \ sf_buf.9 sf_buf_page.9 MLINKS+=sglist.9 sglist_alloc.9 \ sglist.9 sglist_append.9 \ sglist.9 sglist_append_mbuf.9 \ sglist.9 sglist_append_phys.9 \ sglist.9 sglist_append_uio.9 \ sglist.9 sglist_append_user.9 \ sglist.9 sglist_build.9 \ sglist.9 sglist_clone.9 \ sglist.9 sglist_consume_uio.9 \ sglist.9 sglist_count.9 \ sglist.9 sglist_free.9 \ sglist.9 sglist_hold.9 \ sglist.9 sglist_init.9 \ sglist.9 sglist_join.9 \ sglist.9 sglist_length.9 \ sglist.9 sglist_reset.9 \ sglist.9 sglist_slice.9 \ sglist.9 sglist_split.9 MLINKS+=signal.9 cursig.9 \ signal.9 execsigs.9 \ signal.9 issignal.9 \ signal.9 killproc.9 \ signal.9 pgsigio.9 \ signal.9 postsig.9 \ signal.9 SETSETNEQ.9 \ signal.9 SETSETOR.9 \ signal.9 SIGADDSET.9 \ signal.9 SIG_CONTSIGMASK.9 \ signal.9 SIGDELSET.9 \ signal.9 SIGEMPTYSET.9 \ signal.9 sigexit.9 \ signal.9 SIGFILLSET.9 \ signal.9 siginit.9 \ signal.9 SIGISEMPTY.9 \ signal.9 SIGISMEMBER.9 \ signal.9 SIGNOTEMPTY.9 \ signal.9 signotify.9 \ signal.9 SIGPENDING.9 \ signal.9 SIGSETAND.9 \ signal.9 SIGSETCANTMASK.9 \ signal.9 SIGSETEQ.9 \ signal.9 SIGSETNAND.9 \ signal.9 SIG_STOPSIGMASK.9 \ signal.9 trapsignal.9 MLINKS+=sleep.9 msleep.9 \ sleep.9 msleep_spin.9 \ sleep.9 pause.9 \ sleep.9 tsleep.9 \ sleep.9 wakeup.9 \ sleep.9 wakeup_one.9 MLINKS+=sleepqueue.9 init_sleepqueues.9 \ sleepqueue.9 sleepq_abort.9 \ sleepqueue.9 sleepq_add.9 \ sleepqueue.9 sleepq_alloc.9 \ sleepqueue.9 sleepq_broadcast.9 \ sleepqueue.9 sleepq_calc_signal_retval.9 \ sleepqueue.9 sleepq_catch_signals.9 \ sleepqueue.9 sleepq_free.9 \ sleepqueue.9 sleepq_lookup.9 \ sleepqueue.9 sleepq_release.9 \ sleepqueue.9 sleepq_remove.9 \ sleepqueue.9 sleepq_set_timeout.9 \ sleepqueue.9 sleepq_signal.9 \ sleepqueue.9 sleepq_timedwait.9 \ sleepqueue.9 sleepq_timedwait_sig.9 \ sleepqueue.9 sleepq_wait.9 \ sleepqueue.9 sleepq_wait_sig.9 MLINKS+=socket.9 sobind.9 \ socket.9 soclose.9 \ socket.9 soconnect.9 \ socket.9 socreate.9 \ socket.9 sogetopt.9 \ socket.9 soreceive.9 \ socket.9 sosend.9 \ socket.9 sosetopt.9 \ socket.9 soshutdown.9 MLINKS+=spl.9 spl0.9 \ spl.9 splbio.9 \ spl.9 splclock.9 \ spl.9 splhigh.9 \ spl.9 splimp.9 \ spl.9 splnet.9 \ spl.9 splsoftclock.9 \ spl.9 splsofttty.9 \ spl.9 splstatclock.9 \ spl.9 spltty.9 \ spl.9 splvm.9 \ spl.9 splx.9 MLINKS+=stack.9 stack_copy.9 \ stack.9 stack_create.9 \ stack.9 stack_destroy.9 \ stack.9 stack_print.9 \ stack.9 stack_print_ddb.9 \ stack.9 stack_put.9 \ stack.9 stack_save.9 \ stack.9 stack_sbuf_print.9 \ stack.9 stack_sbuf_print_ddb.9 \ stack.9 stack_zero.9 MLINKS+=store.9 subyte.9 \ store.9 suswintr.9 \ store.9 suword.9 \ store.9 suword16.9 \ store.9 suword32.9 \ store.9 suword64.9 MLINKS+=swi.9 swi_add.9 \ swi.9 swi_sched.9 MLINKS+=sx.9 sx_assert.9 \ sx.9 sx_destroy.9 \ sx.9 sx_downgrade.9 \ sx.9 sx_init.9 \ sx.9 sx_init_flags.9 \ sx.9 sx_sleep.9 \ sx.9 sx_slock.9 \ sx.9 sx_sunlock.9 \ sx.9 SX_SYSINIT.9 \ sx.9 sx_try_slock.9 \ sx.9 sx_try_upgrade.9 \ sx.9 sx_try_xlock.9 \ sx.9 sx_unlock.9 \ sx.9 sx_xholder.9 \ sx.9 sx_xlock.9 \ sx.9 sx_xlocked.9 \ sx.9 sx_xunlock.9 MLINKS+=sysctl.9 SYSCTL_DECL.9 \ sysctl.9 SYSCTL_INT.9 \ sysctl.9 SYSCTL_LONG.9 \ sysctl.9 SYSCTL_NODE.9 \ sysctl.9 SYSCTL_OPAQUE.9 \ sysctl.9 SYSCTL_PROC.9 \ sysctl.9 SYSCTL_STRING.9 \ sysctl.9 SYSCTL_STRUCT.9 \ sysctl.9 SYSCTL_UINT.9 \ sysctl.9 SYSCTL_ULONG.9 \ sysctl.9 SYSCTL_XINT.9 \ sysctl.9 SYSCTL_XLONG.9 MLINKS+=sysctl_add_oid.9 SYSCTL_ADD_INT.9 \ sysctl_add_oid.9 SYSCTL_ADD_LONG.9 \ sysctl_add_oid.9 SYSCTL_ADD_NODE.9 \ sysctl_add_oid.9 SYSCTL_ADD_OID.9 \ sysctl_add_oid.9 SYSCTL_ADD_OPAQUE.9 \ sysctl_add_oid.9 SYSCTL_ADD_PROC.9 \ sysctl_add_oid.9 SYSCTL_ADD_STRING.9 \ sysctl_add_oid.9 SYSCTL_ADD_STRUCT.9 \ sysctl_add_oid.9 SYSCTL_ADD_UINT.9 \ sysctl_add_oid.9 SYSCTL_ADD_ULONG.9 \ sysctl_add_oid.9 SYSCTL_CHILDREN.9 \ sysctl_add_oid.9 sysctl_move_oid.9 \ sysctl_add_oid.9 sysctl_remove_oid.9 \ sysctl_add_oid.9 SYSCTL_STATIC_CHILDREN.9 MLINKS+=sysctl_ctx_init.9 sysctl_ctx_entry_add.9 \ sysctl_ctx_init.9 sysctl_ctx_entry_del.9 \ sysctl_ctx_init.9 sysctl_ctx_entry_find.9 \ sysctl_ctx_init.9 sysctl_ctx_free.9 MLINKS+=taskqueue.9 TASK_INIT.9 \ taskqueue.9 taskqueue_create.9 \ taskqueue.9 TASKQUEUE_DECLARE.9 \ taskqueue.9 TASKQUEUE_DEFINE.9 \ taskqueue.9 taskqueue_enqueue.9 \ taskqueue.9 taskqueue_find.9 \ taskqueue.9 taskqueue_free.9 \ taskqueue.9 taskqueue_run.9 MLINKS+=time.9 boottime.9 \ time.9 time_second.9 \ time.9 time_uptime.9 MLINKS+=timeout.9 callout.9 \ timeout.9 callout_active.9 \ timeout.9 callout_deactivate.9 \ timeout.9 callout_drain.9 \ timeout.9 callout_handle_init.9 \ timeout.9 callout_init.9 \ timeout.9 callout_init_mtx.9 \ timeout.9 callout_init_rw.9 \ timeout.9 callout_pending.9 \ timeout.9 callout_reset.9 \ timeout.9 callout_schedule.9 \ timeout.9 callout_stop.9 \ timeout.9 untimeout.9 MLINKS+=ucred.9 crcopy.9 \ ucred.9 crdup.9 \ ucred.9 crfree.9 \ ucred.9 crget.9 \ ucred.9 crhold.9 \ ucred.9 crshared.9 \ ucred.9 cru2x.9 MLINKS+=uidinfo.9 uifind.9 \ uidinfo.9 uifree.9 \ uidinfo.9 uihashinit.9 \ uidinfo.9 uihold.9 MLINKS+=uio.9 uiomove.9 MLINKS+=usbdi.9 usb_fifo_alloc_buffer.9 \ usbdi.9 usb_fifo_attach.9 \ usbdi.9 usb_fifo_detach.9 \ usbdi.9 usb_fifo_free_buffer.9 \ usbdi.9 usb_fifo_get_data.9 \ usbdi.9 usb_fifo_get_data_buffer.9 \ usbdi.9 usb_fifo_get_data_error.9 \ usbdi.9 usb_fifo_get_data_linear.9 \ usbdi.9 usb_fifo_put_bytes_max.9 \ usbdi.9 usb_fifo_put_data.9 \ usbdi.9 usb_fifo_put_data_buffer.9 \ usbdi.9 usb_fifo_put_data_error.9 \ usbdi.9 usb_fifo_put_data_linear.9 \ usbdi.9 usb_fifo_reset.9 \ usbdi.9 usb_fifo_softc.9 \ usbdi.9 usb_fifo_wakeup.9 \ usbdi.9 usbd_do_request.9 \ usbdi.9 usbd_do_request_flags.9 \ usbdi.9 usbd_errstr.9 \ usbdi.9 usbd_lookup_id_by_info.9 \ usbdi.9 usbd_lookup_id_by_uaa.9 \ usbdi.9 usbd_transfer_clear_stall.9 \ usbdi.9 usbd_transfer_drain.9 \ usbdi.9 usbd_transfer_pending.9 \ usbdi.9 usbd_transfer_poll.9 \ usbdi.9 usbd_transfer_setup.9 \ usbdi.9 usbd_transfer_start.9 \ usbdi.9 usbd_transfer_stop.9 \ usbdi.9 usbd_transfer_submit.9 \ usbdi.9 usbd_transfer_unsetup.9 \ usbdi.9 usbd_xfer_clr_flag.9 \ usbdi.9 usbd_xfer_frame_data.9 \ usbdi.9 usbd_xfer_frame_len.9 \ usbdi.9 usbd_xfer_get_frame.9 \ usbdi.9 usbd_xfer_get_priv.9 \ usbdi.9 usbd_xfer_is_stalled.9 \ usbdi.9 usbd_xfer_max_framelen.9 \ usbdi.9 usbd_xfer_max_frames.9 \ usbdi.9 usbd_xfer_max_len.9 \ usbdi.9 usbd_xfer_set_flag.9 \ usbdi.9 usbd_xfer_set_frame_data.9 \ usbdi.9 usbd_xfer_set_frame_len.9 \ usbdi.9 usbd_xfer_set_frame_offset.9 \ usbdi.9 usbd_xfer_set_frames.9 \ usbdi.9 usbd_xfer_set_interval.9 \ usbdi.9 usbd_xfer_set_priv.9 \ usbdi.9 usbd_xfer_set_stall.9 \ usbdi.9 usbd_xfer_set_timeout.9 \ usbdi.9 usbd_xfer_softc.9 \ usbdi.9 usbd_xfer_state.9 \ usbdi.9 usbd_xfer_state.9 \ usbdi.9 usbd_xfer_status.9 MLINKS+=vcount.9 count_dev.9 MLINKS+=vfsconf.9 vfs_modevent.9 \ vfsconf.9 vfs_register.9 \ vfsconf.9 vfs_unregister.9 MLINKS+=vfs_getopt.9 vfs_copyopt.9 \ vfs_getopt.9 vfs_filteropt.9 \ vfs_getopt.9 vfs_flagopt.9 \ vfs_getopt.9 vfs_getopts.9 \ vfs_getopt.9 vfs_scanopt.9 \ vfs_getopt.9 vfs_setopt.9 \ vfs_getopt.9 vfs_setopt_part.9 \ vfs_getopt.9 vfs_setopts.9 MLINKS+=VFS_LOCK_GIANT.9 VFS_UNLOCK_GIANT.9 MLINKS+=vhold.9 vdrop.9 \ vhold.9 vholdl.9 \ vhold.9 vdropl.9 MLINKS+=vm_map_lock.9 vm_map_lock_downgrade.9 \ vm_map_lock.9 vm_map_lock_read.9 \ vm_map_lock.9 vm_map_lock_upgrade.9 \ vm_map_lock.9 vm_map_trylock.9 \ vm_map_lock.9 vm_map_trylock_read.9 \ vm_map_lock.9 vm_map_unlock.9 \ vm_map_lock.9 vm_map_unlock_read.9 MLINKS+=vm_map_lookup.9 vm_map_lookup_done.9 MLINKS+=vm_map_max.9 vm_map_min.9 \ vm_map_max.9 vm_map_pmap.9 MLINKS+=vm_map_stack.9 vm_map_growstack.9 MLINKS+=vm_map_wire.9 vm_map_unwire.9 MLINKS+=vm_page_bits.9 vm_page_clear_dirty.9 \ vm_page_bits.9 vm_page_dirty.9 \ vm_page_bits.9 vm_page_is_valid.9 \ vm_page_bits.9 vm_page_set_invalid.9 \ vm_page_bits.9 vm_page_set_validclean.9 \ vm_page_bits.9 vm_page_test_dirty.9 \ vm_page_bits.9 vm_page_undirty.9 \ vm_page_bits.9 vm_page_zero_invalid.9 MLINKS+=vm_page_flag.9 vm_page_flag_clear.9 \ vm_page_flag.9 vm_page_flag_set.9 MLINKS+=vm_page_free.9 vm_page_free_toq.9 \ vm_page_free.9 vm_page_free_zero.9 \ vm_page_free.9 vm_page_try_to_free.9 MLINKS+=vm_page_hold.9 vm_page_unhold.9 MLINKS+=vm_page_insert.9 vm_page_remove.9 MLINKS+=vm_page_io.9 vm_page_io_finish.9 \ vm_page_io.9 vm_page_io_start.9 MLINKS+=vm_page_wakeup.9 vm_page_busy.9 \ vm_page_wakeup.9 vm_page_flash.9 MLINKS+=vm_page_wire.9 vm_page_unwire.9 MLINKS+=VOP_ACCESS.9 VOP_ACCESSX.9 MLINKS+=VOP_ATTRIB.9 VOP_GETATTR.9 \ VOP_ATTRIB.9 VOP_SETATTR.9 MLINKS+=VOP_CREATE.9 VOP_MKDIR.9 \ VOP_CREATE.9 VOP_MKNOD.9 \ VOP_CREATE.9 VOP_SYMLINK.9 MLINKS+=VOP_GETPAGES.9 VOP_PUTPAGES.9 MLINKS+=VOP_GETVOBJECT.9 VOP_CREATEVOBJECT.9 \ VOP_GETVOBJECT.9 VOP_DESTROYVOBJECT.9 MLINKS+=VOP_INACTIVE.9 VOP_RECLAIM.9 MLINKS+=VOP_LOCK.9 vn_lock.9 \ VOP_LOCK.9 VOP_ISLOCKED.9 \ VOP_LOCK.9 VOP_UNLOCK.9 MLINKS+=VOP_OPENCLOSE.9 VOP_CLOSE.9 \ VOP_OPENCLOSE.9 VOP_OPEN.9 MLINKS+=VOP_RDWR.9 VOP_READ.9 \ VOP_RDWR.9 VOP_WRITE.9 MLINKS+=VOP_REMOVE.9 VOP_RMDIR.9 MLINKS+=vref.9 VREF.9 MLINKS+=vrele.9 vput.9 \ vrele.9 vunref.9 MLINKS+=vslock.9 vsunlock.9 MLINKS+=zero_copy.9 zero_copy_sockets.9 MLINKS+=zone.9 uma.9 \ zone.9 uma_zalloc.9 \ zone.9 uma_zcreate.9 \ zone.9 uma_zdestroy.9 \ zone.9 uma_zfree.9 \ zone.9 uma_zone_set_max.9 .include Index: stable/8/share/man/man9/selrecord.9 =================================================================== --- stable/8/share/man/man9/selrecord.9 (revision 225584) +++ stable/8/share/man/man9/selrecord.9 (revision 225585) @@ -1,109 +1,126 @@ .\" .\" Copyright (C) 2002 Chad David . All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd June 13, 2007 +.Dd August 25, 2011 .Dt SELRECORD 9 .Os .Sh NAME +.Nm seldrain , .Nm selrecord , .Nm selwakeup .Nd "record and wakeup select requests" .Sh SYNOPSIS .In sys/param.h .In sys/selinfo.h .Ft void +.Fn seldrain "struct selinfo *sip" +.Ft void .Fn selrecord "struct thread *td" "struct selinfo *sip" .Ft void .Fn selwakeup "struct selinfo *sip" .Sh DESCRIPTION +.Fn seldrain , .Fn selrecord and .Fn selwakeup -are the two central functions used by +are the three central functions used by .Xr select 2 , .Xr poll 2 and the objects that are being selected on. They handle the task of recording which threads are waiting on which objects and the waking of the proper threads when an event of interest occurs on an object. .Pp .Fn selrecord records that the calling thread is interested in events related to a given object. If another thread is already waiting on the object a collision will be flagged in .Fa sip which will be later dealt with by .Fn selwakeup . .Pp .Fn selrecord acquires and releases .Va sellock . .Pp .Fn selwakeup is called by the underlying object handling code in order to notify any waiting threads that an event of interest has occurred. If a collision has occurred, .Fn selwakeup will increment .Va nselcoll , and broadcast on the global cv in order to wake all waiting threads so that they can handle it. If the thread waiting on the object is not currently sleeping or the wait channel is not .Va selwait , .Fn selwakeup will clear the .Dv TDF_SELECT flag which should be noted by .Xr select 2 and .Xr poll 2 when they wake up. .Pp +.Fn seldrain +will flush the waiters queue on a specified object before its +destruction. +The object handling code must ensure that +.Fa *sip +cannot be used once +.Fn seldrain +has been called. +.Pp The contents of .Fa *sip must be zeroed, such as by softc initialization, before any call to .Fn selrecord or .Fn selwakeup , otherwise a panic may occur. .Fn selwakeup acquires and releases .Va sellock and may acquire and release .Va sched_lock . +.Fn seldrain +could usually be just a wrapper for +.Fn selwakeup , +but consumers should not generally rely on this feature. .Sh SEE ALSO .Xr poll 2 , .Xr select 2 .Sh AUTHORS .An -nosplit This manual page was written by .An Chad David Aq davidc@FreeBSD.org and .An Alfred Perlstein Aq alfred@FreeBSD.org . Index: stable/8/share/man/man9 =================================================================== --- stable/8/share/man/man9 (revision 225584) +++ stable/8/share/man/man9 (revision 225585) Property changes on: stable/8/share/man/man9 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/share/man/man9:r225177,225181 Index: stable/8/sys/amd64/include/xen =================================================================== --- stable/8/sys/amd64/include/xen (revision 225584) +++ stable/8/sys/amd64/include/xen (revision 225585) Property changes on: stable/8/sys/amd64/include/xen ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/amd64/include/xen:r225177 Index: stable/8/sys/cddl/contrib/opensolaris =================================================================== --- stable/8/sys/cddl/contrib/opensolaris (revision 225584) +++ stable/8/sys/cddl/contrib/opensolaris (revision 225585) Property changes on: stable/8/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r225177 Index: stable/8/sys/contrib/dev/acpica =================================================================== --- stable/8/sys/contrib/dev/acpica (revision 225584) +++ stable/8/sys/contrib/dev/acpica (revision 225585) Property changes on: stable/8/sys/contrib/dev/acpica ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/contrib/dev/acpica:r225177 Index: stable/8/sys/contrib/pf =================================================================== --- stable/8/sys/contrib/pf (revision 225584) +++ stable/8/sys/contrib/pf (revision 225585) Property changes on: stable/8/sys/contrib/pf ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/contrib/pf:r225177 Index: stable/8/sys/i386/acpica/acpi_machdep.c =================================================================== --- stable/8/sys/i386/acpica/acpi_machdep.c (revision 225584) +++ stable/8/sys/i386/acpica/acpi_machdep.c (revision 225585) @@ -1,837 +1,838 @@ /*- * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * APM driver emulation */ #include #include #include SYSCTL_DECL(_debug_acpi); uint32_t acpi_resume_beep; TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep); SYSCTL_UINT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep, 0, "Beep the PC speaker when resuming"); uint32_t acpi_reset_video; TUNABLE_INT("hw.acpi.reset_video", &acpi_reset_video); static int intr_model = ACPI_INTR_PIC; static int apm_active; static struct clonedevs *apm_clones; MALLOC_DEFINE(M_APMDEV, "apmdev", "APM device emulation"); static d_open_t apmopen; static d_close_t apmclose; static d_write_t apmwrite; static d_ioctl_t apmioctl; static d_poll_t apmpoll; static d_kqfilter_t apmkqfilter; static void apmreadfiltdetach(struct knote *kn); static int apmreadfilt(struct knote *kn, long hint); static struct filterops apm_readfiltops = { 1, NULL, apmreadfiltdetach, apmreadfilt }; static struct cdevsw apm_cdevsw = { .d_version = D_VERSION, .d_flags = D_TRACKCLOSE | D_NEEDMINOR, .d_open = apmopen, .d_close = apmclose, .d_write = apmwrite, .d_ioctl = apmioctl, .d_poll = apmpoll, .d_name = "apm", .d_kqfilter = apmkqfilter }; static int acpi_capm_convert_battstate(struct acpi_battinfo *battp) { int state; state = APM_UNKNOWN; if (battp->state & ACPI_BATT_STAT_DISCHARG) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } if (battp->state & ACPI_BATT_STAT_CRITICAL) state = 2; /* critical */ if (battp->state & ACPI_BATT_STAT_CHARGING) state = 3; /* charging */ /* If still unknown, determine it based on the battery capacity. */ if (state == APM_UNKNOWN) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } return (state); } static int acpi_capm_convert_battflags(struct acpi_battinfo *battp) { int flags; flags = 0; if (battp->cap >= 50) flags |= APM_BATT_HIGH; else { if (battp->state & ACPI_BATT_STAT_CRITICAL) flags |= APM_BATT_CRITICAL; else flags |= APM_BATT_LOW; } if (battp->state & ACPI_BATT_STAT_CHARGING) flags |= APM_BATT_CHARGING; if (battp->state == ACPI_BATT_STAT_NOT_PRESENT) flags = APM_BATT_NOT_PRESENT; return (flags); } static int acpi_capm_get_info(apm_info_t aip) { int acline; struct acpi_battinfo batt; aip->ai_infoversion = 1; aip->ai_major = 1; aip->ai_minor = 2; aip->ai_status = apm_active; aip->ai_capabilities= 0xff00; /* unknown */ if (acpi_acad_get_acline(&acline)) aip->ai_acline = APM_UNKNOWN; /* unknown */ else aip->ai_acline = acline; /* on/off */ if (acpi_battery_get_battinfo(NULL, &batt) != 0) { aip->ai_batt_stat = APM_UNKNOWN; aip->ai_batt_life = APM_UNKNOWN; aip->ai_batt_time = -1; /* unknown */ aip->ai_batteries = ~0U; /* unknown */ } else { aip->ai_batt_stat = acpi_capm_convert_battstate(&batt); aip->ai_batt_life = batt.cap; aip->ai_batt_time = (batt.min == -1) ? -1 : batt.min * 60; aip->ai_batteries = acpi_battery_get_units(); } return (0); } static int acpi_capm_get_pwstatus(apm_pwstatus_t app) { device_t dev; int acline, unit, error; struct acpi_battinfo batt; if (app->ap_device != PMDV_ALLDEV && (app->ap_device < PMDV_BATT0 || app->ap_device > PMDV_BATT_ALL)) return (1); if (app->ap_device == PMDV_ALLDEV) error = acpi_battery_get_battinfo(NULL, &batt); else { unit = app->ap_device - PMDV_BATT0; dev = devclass_get_device(devclass_find("battery"), unit); if (dev != NULL) error = acpi_battery_get_battinfo(dev, &batt); else error = ENXIO; } if (error) return (1); app->ap_batt_stat = acpi_capm_convert_battstate(&batt); app->ap_batt_flag = acpi_capm_convert_battflags(&batt); app->ap_batt_life = batt.cap; app->ap_batt_time = (batt.min == -1) ? -1 : batt.min * 60; if (acpi_acad_get_acline(&acline)) app->ap_acline = APM_UNKNOWN; else app->ap_acline = acline; /* on/off */ return (0); } /* Create single-use devices for /dev/apm and /dev/apmctl. */ static void apm_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { int ctl_dev, unit; if (*dev != NULL) return; if (strcmp(name, "apmctl") == 0) ctl_dev = TRUE; else if (strcmp(name, "apm") == 0) ctl_dev = FALSE; else return; /* Always create a new device and unit number. */ unit = -1; if (clone_create(&apm_clones, &apm_cdevsw, &unit, dev, 0)) { if (ctl_dev) { *dev = make_dev(&apm_cdevsw, unit, UID_ROOT, GID_OPERATOR, 0660, "apmctl%d", unit); } else { *dev = make_dev(&apm_cdevsw, unit, UID_ROOT, GID_OPERATOR, 0664, "apm%d", unit); } if (*dev != NULL) { dev_ref(*dev); (*dev)->si_flags |= SI_CHEAPCLONE; } } } /* Create a struct for tracking per-device suspend notification. */ static struct apm_clone_data * apm_create_clone(struct cdev *dev, struct acpi_softc *acpi_sc) { struct apm_clone_data *clone; clone = malloc(sizeof(*clone), M_APMDEV, M_WAITOK); clone->cdev = dev; clone->acpi_sc = acpi_sc; clone->notify_status = APM_EV_NONE; bzero(&clone->sel_read, sizeof(clone->sel_read)); knlist_init_mtx(&clone->sel_read.si_note, &acpi_mutex); /* * The acpi device is always managed by devd(8) and is considered * writable (i.e., ack is required to allow suspend to proceed.) */ if (strcmp("acpi", devtoname(dev)) == 0) clone->flags = ACPI_EVF_DEVD | ACPI_EVF_WRITE; else clone->flags = ACPI_EVF_NONE; ACPI_LOCK(acpi); STAILQ_INSERT_TAIL(&acpi_sc->apm_cdevs, clone, entries); ACPI_UNLOCK(acpi); return (clone); } static int apmopen(struct cdev *dev, int flag, int fmt, struct thread *td) { struct acpi_softc *acpi_sc; struct apm_clone_data *clone; acpi_sc = devclass_get_softc(devclass_find("acpi"), 0); clone = apm_create_clone(dev, acpi_sc); dev->si_drv1 = clone; /* If the device is opened for write, record that. */ if ((flag & FWRITE) != 0) clone->flags |= ACPI_EVF_WRITE; return (0); } static int apmclose(struct cdev *dev, int flag, int fmt, struct thread *td) { struct apm_clone_data *clone; struct acpi_softc *acpi_sc; clone = dev->si_drv1; acpi_sc = clone->acpi_sc; /* We are about to lose a reference so check if suspend should occur */ if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status != APM_EV_ACKED) acpi_AckSleepState(clone, 0); /* Remove this clone's data from the list and free it. */ ACPI_LOCK(acpi); STAILQ_REMOVE(&acpi_sc->apm_cdevs, clone, apm_clone_data, entries); + seldrain(&clone->sel_read); knlist_destroy(&clone->sel_read.si_note); ACPI_UNLOCK(acpi); free(clone, M_APMDEV); destroy_dev_sched(dev); return (0); } static int apmioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; struct apm_clone_data *clone; struct acpi_softc *acpi_sc; struct apm_info info; struct apm_event_info *ev_info; apm_info_old_t aiop; error = 0; clone = dev->si_drv1; acpi_sc = clone->acpi_sc; switch (cmd) { case APMIO_SUSPEND: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_suspend_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_suspend_sx); } else { printf( "power off via apm suspend not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_STANDBY: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_standby_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_standby_sx); } else { printf( "power off via apm standby not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_NEXTEVENT: printf("apm nextevent start\n"); ACPI_LOCK(acpi); if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status == APM_EV_NONE) { ev_info = (struct apm_event_info *)addr; if (acpi_sc->acpi_next_sstate <= ACPI_STATE_S3) ev_info->type = PMEV_STANDBYREQ; else ev_info->type = PMEV_SUSPENDREQ; ev_info->index = 0; clone->notify_status = APM_EV_NOTIFIED; printf("apm event returning %d\n", ev_info->type); } else error = EAGAIN; ACPI_UNLOCK(acpi); break; case APMIO_GETINFO_OLD: if (acpi_capm_get_info(&info)) error = ENXIO; aiop = (apm_info_old_t)addr; aiop->ai_major = info.ai_major; aiop->ai_minor = info.ai_minor; aiop->ai_acline = info.ai_acline; aiop->ai_batt_stat = info.ai_batt_stat; aiop->ai_batt_life = info.ai_batt_life; aiop->ai_status = info.ai_status; break; case APMIO_GETINFO: if (acpi_capm_get_info((apm_info_t)addr)) error = ENXIO; break; case APMIO_GETPWSTATUS: if (acpi_capm_get_pwstatus((apm_pwstatus_t)addr)) error = ENXIO; break; case APMIO_ENABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 1; break; case APMIO_DISABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 0; break; case APMIO_HALTCPU: break; case APMIO_NOTHALTCPU: break; case APMIO_DISPLAY: if ((flag & FWRITE) == 0) return (EPERM); break; case APMIO_BIOS: if ((flag & FWRITE) == 0) return (EPERM); bzero(addr, sizeof(struct apm_bios_arg)); break; default: error = EINVAL; break; } return (error); } static int apmwrite(struct cdev *dev, struct uio *uio, int ioflag) { return (uio->uio_resid); } static int apmpoll(struct cdev *dev, int events, struct thread *td) { struct apm_clone_data *clone; int revents; revents = 0; ACPI_LOCK(acpi); clone = dev->si_drv1; if (clone->acpi_sc->acpi_next_sstate) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &clone->sel_read); ACPI_UNLOCK(acpi); return (revents); } static int apmkqfilter(struct cdev *dev, struct knote *kn) { struct apm_clone_data *clone; ACPI_LOCK(acpi); clone = dev->si_drv1; kn->kn_hook = clone; kn->kn_fop = &apm_readfiltops; knlist_add(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); return (0); } static void apmreadfiltdetach(struct knote *kn) { struct apm_clone_data *clone; ACPI_LOCK(acpi); clone = kn->kn_hook; knlist_remove(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); } static int apmreadfilt(struct knote *kn, long hint) { struct apm_clone_data *clone; int sleeping; ACPI_LOCK(acpi); clone = kn->kn_hook; sleeping = clone->acpi_sc->acpi_next_sstate ? 1 : 0; ACPI_UNLOCK(acpi); return (sleeping); } int acpi_machdep_init(device_t dev) { struct acpi_softc *acpi_sc; acpi_sc = devclass_get_softc(devclass_find("acpi"), 0); /* Create a clone for /dev/acpi also. */ STAILQ_INIT(&acpi_sc->apm_cdevs); acpi_sc->acpi_clone = apm_create_clone(acpi_sc->acpi_dev_t, acpi_sc); clone_setup(&apm_clones); EVENTHANDLER_REGISTER(dev_clone, apm_clone, 0, 1000); acpi_install_wakeup_handler(acpi_sc); if (intr_model == ACPI_INTR_PIC) BUS_CONFIG_INTR(dev, AcpiGbl_FADT.SciInterrupt, INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW); else acpi_SetIntrModel(intr_model); SYSCTL_ADD_UINT(&acpi_sc->acpi_sysctl_ctx, SYSCTL_CHILDREN(acpi_sc->acpi_sysctl_tree), OID_AUTO, "reset_video", CTLFLAG_RW, &acpi_reset_video, 0, "Call the VESA reset BIOS vector on the resume path"); return (0); } void acpi_SetDefaultIntrModel(int model) { intr_model = model; } /* Check BIOS date. If 1998 or older, disable ACPI. */ int acpi_machdep_quirks(int *quirks) { char *va; int year; /* BIOS address 0xffff5 contains the date in the format mm/dd/yy. */ va = pmap_mapbios(0xffff0, 16); sscanf(va + 11, "%2d", &year); pmap_unmapbios((vm_offset_t)va, 16); /* * Date must be >= 1/1/1999 or we don't trust ACPI. Note that this * check must be changed by my 114th birthday. */ if (year > 90 && year < 99) *quirks = ACPI_Q_BROKEN; return (0); } void acpi_cpu_c1() { __asm __volatile("sti; hlt"); } /* * Support for mapping ACPI tables during early boot. This abuses the * crashdump map because the kernel cannot allocate KVA in * pmap_mapbios() when this is used. This makes the following * assumptions about how we use this KVA: pages 0 and 1 are used to * map in the header of each table found via the RSDT or XSDT and * pages 2 to n are used to map in the RSDT or XSDT. This has to use * 2 pages for the table headers in case a header spans a page * boundary. * * XXX: We don't ensure the table fits in the available address space * in the crashdump map. */ /* * Map some memory using the crashdump map. 'offset' is an offset in * pages into the crashdump map to use for the start of the mapping. */ static void * table_map(vm_paddr_t pa, int offset, vm_offset_t length) { vm_offset_t va, off; void *data; off = pa & PAGE_MASK; length = roundup(length + off, PAGE_SIZE); pa = pa & PG_FRAME; va = (vm_offset_t)pmap_kenter_temporary(pa, offset) + (offset * PAGE_SIZE); data = (void *)(va + off); length -= PAGE_SIZE; while (length > 0) { va += PAGE_SIZE; pa += PAGE_SIZE; length -= PAGE_SIZE; pmap_kenter(va, pa); invlpg(va); } return (data); } /* Unmap memory previously mapped with table_map(). */ static void table_unmap(void *data, vm_offset_t length) { vm_offset_t va, off; va = (vm_offset_t)data; off = va & PAGE_MASK; length = roundup(length + off, PAGE_SIZE); va &= ~PAGE_MASK; while (length > 0) { pmap_kremove(va); invlpg(va); va += PAGE_SIZE; length -= PAGE_SIZE; } } /* * Map a table at a given offset into the crashdump map. It first * maps the header to determine the table length and then maps the * entire table. */ static void * map_table(vm_paddr_t pa, int offset, const char *sig) { ACPI_TABLE_HEADER *header; vm_offset_t length; void *table; header = table_map(pa, offset, sizeof(ACPI_TABLE_HEADER)); if (strncmp(header->Signature, sig, ACPI_NAME_SIZE) != 0) { table_unmap(header, sizeof(ACPI_TABLE_HEADER)); return (NULL); } length = header->Length; table_unmap(header, sizeof(ACPI_TABLE_HEADER)); table = table_map(pa, offset, length); if (ACPI_FAILURE(AcpiTbChecksum(table, length))) { if (bootverbose) printf("ACPI: Failed checksum for table %s\n", sig); #if (ACPI_CHECKSUM_ABORT) table_unmap(table, length); return (NULL); #endif } return (table); } /* * See if a given ACPI table is the requested table. Returns the * length of the able if it matches or zero on failure. */ static int probe_table(vm_paddr_t address, const char *sig) { ACPI_TABLE_HEADER *table; table = table_map(address, 0, sizeof(ACPI_TABLE_HEADER)); if (table == NULL) { if (bootverbose) printf("ACPI: Failed to map table at 0x%jx\n", (uintmax_t)address); return (0); } if (bootverbose) printf("Table '%.4s' at 0x%jx\n", table->Signature, (uintmax_t)address); if (strncmp(table->Signature, sig, ACPI_NAME_SIZE) != 0) { table_unmap(table, sizeof(ACPI_TABLE_HEADER)); return (0); } table_unmap(table, sizeof(ACPI_TABLE_HEADER)); return (1); } /* * Try to map a table at a given physical address previously returned * by acpi_find_table(). */ void * acpi_map_table(vm_paddr_t pa, const char *sig) { return (map_table(pa, 0, sig)); } /* Unmap a table previously mapped via acpi_map_table(). */ void acpi_unmap_table(void *table) { ACPI_TABLE_HEADER *header; header = (ACPI_TABLE_HEADER *)table; table_unmap(table, header->Length); } /* * Return the physical address of the requested table or zero if one * is not found. */ vm_paddr_t acpi_find_table(const char *sig) { ACPI_PHYSICAL_ADDRESS rsdp_ptr; ACPI_TABLE_RSDP *rsdp; ACPI_TABLE_RSDT *rsdt; ACPI_TABLE_XSDT *xsdt; ACPI_TABLE_HEADER *table; vm_paddr_t addr; int i, count; if (resource_disabled("acpi", 0)) return (0); /* * Map in the RSDP. Since ACPI uses AcpiOsMapMemory() which in turn * calls pmap_mapbios() to find the RSDP, we assume that we can use * pmap_mapbios() to map the RSDP. */ if ((rsdp_ptr = AcpiOsGetRootPointer()) == 0) return (0); rsdp = pmap_mapbios(rsdp_ptr, sizeof(ACPI_TABLE_RSDP)); if (rsdp == NULL) { if (bootverbose) printf("ACPI: Failed to map RSDP\n"); return (0); } /* * For ACPI >= 2.0, use the XSDT if it is available. * Otherwise, use the RSDT. We map the XSDT or RSDT at page 2 * in the crashdump area. Pages 0 and 1 are used to map in the * headers of candidate ACPI tables. */ addr = 0; if (rsdp->Revision >= 2 && rsdp->XsdtPhysicalAddress != 0) { /* * AcpiOsGetRootPointer only verifies the checksum for * the version 1.0 portion of the RSDP. Version 2.0 has * an additional checksum that we verify first. */ if (AcpiTbChecksum((UINT8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)) { if (bootverbose) printf("ACPI: RSDP failed extended checksum\n"); return (0); } xsdt = map_table(rsdp->XsdtPhysicalAddress, 2, ACPI_SIG_XSDT); if (xsdt == NULL) { if (bootverbose) printf("ACPI: Failed to map XSDT\n"); return (0); } count = (xsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / sizeof(UINT64); for (i = 0; i < count; i++) if (probe_table(xsdt->TableOffsetEntry[i], sig)) { addr = xsdt->TableOffsetEntry[i]; break; } acpi_unmap_table(xsdt); } else { rsdt = map_table(rsdp->RsdtPhysicalAddress, 2, ACPI_SIG_RSDT); if (rsdt == NULL) { if (bootverbose) printf("ACPI: Failed to map RSDT\n"); return (0); } count = (rsdt->Header.Length - sizeof(ACPI_TABLE_HEADER)) / sizeof(UINT32); for (i = 0; i < count; i++) if (probe_table(rsdt->TableOffsetEntry[i], sig)) { addr = rsdt->TableOffsetEntry[i]; break; } acpi_unmap_table(rsdt); } pmap_unmapbios((vm_offset_t)rsdp, sizeof(ACPI_TABLE_RSDP)); if (addr == 0) { if (bootverbose) printf("ACPI: No %s table found\n", sig); return (0); } if (bootverbose) printf("%s: Found table at 0x%jx\n", sig, (uintmax_t)addr); /* * Verify that we can map the full table and that its checksum is * correct, etc. */ table = map_table(addr, 0, sig); if (table == NULL) return (0); acpi_unmap_table(table); return (addr); } /* * ACPI nexus(4) driver. */ static int nexus_acpi_probe(device_t dev) { int error; error = acpi_identify(); if (error) return (error); return (BUS_PROBE_DEFAULT); } static int nexus_acpi_attach(device_t dev) { nexus_init_resources(); bus_generic_probe(dev); if (BUS_ADD_CHILD(dev, 10, "acpi", 0) == NULL) panic("failed to add acpi0 device"); return (bus_generic_attach(dev)); } static device_method_t nexus_acpi_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nexus_acpi_probe), DEVMETHOD(device_attach, nexus_acpi_attach), { 0, 0 } }; DEFINE_CLASS_1(nexus, nexus_acpi_driver, nexus_acpi_methods, 1, nexus_driver); static devclass_t nexus_devclass; DRIVER_MODULE(nexus_acpi, root, nexus_acpi_driver, nexus_devclass, 0, 0); Index: stable/8/sys/kern/kern_event.c =================================================================== --- stable/8/sys/kern/kern_event.c (revision 225584) +++ stable/8/sys/kern/kern_event.c (revision 225585) @@ -1,2182 +1,2183 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); static fo_rdwr_t kqueue_read; static fo_rdwr_t kqueue_write; static fo_truncate_t kqueue_truncate; static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static struct fileops kqueueops = { .fo_read = kqueue_read, .fo_write = kqueue_write, .fo_truncate = kqueue_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not KN_INFLUX?? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) #define KN_LIST_LOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KN_LIST_UNLOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops }, /* EVFILT_READ */ { &file_filtops }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops }, /* EVFILT_VNODE */ { &proc_filtops }, /* EVFILT_PROC */ { &sig_filtops }, /* EVFILT_SIGNAL */ { &timer_filtops }, /* EVFILT_TIMER */ { &null_filtops }, /* former EVFILT_NETDEV */ { &fs_filtops }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops }, /* EVFILT_USER */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int immediate; int error; immediate = 0; p = pfind(kn->kn_id); if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { p = zpfind(kn->kn_id); immediate = 1; } else if (p != NULL && (p->p_flag & P_WEXIT)) { immediate = 1; } if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } if (immediate == 0) knlist_add(&p->p_klist, kn, 1); /* * Immediately activate any exit notes if the target process is a * zombie. This is necessary to handle the case where the target * process, e.g. a child, dies before the kevent is registered. */ if (immediate && filt_proc(kn, NOTE_EXIT)) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { struct proc *p; p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p = kn->kn_ptr.p_proc; u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { if (!(kn->kn_status & KN_DETACHED)) knlist_remove_inevent(&p->p_klist, kn); kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = p->p_xstat; kn->kn_ptr.p_proc = NULL; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; if (list == NULL) return; list->kl_lock(list->kl_lockarg); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) continue; kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, NOTE_FORK | pid)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new event to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register a knote with * new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (kn->kn_fop->f_event(kn, NOTE_FORK | pid)) KNOTE_ACTIVATE(kn, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; KQ_LOCK(kq); kn->kn_status &= ~KN_INFLUX; KQ_UNLOCK_FLUX(kq); list->kl_lock(list->kl_lockarg); } list->kl_unlock(list->kl_lockarg); } static int timertoticks(intptr_t data) { struct timeval tv; int tticks; tv.tv_sec = data / 1000; tv.tv_usec = (data % 1000) * 1000; tticks = tvtohz(&tv); return tticks; } /* XXX - move to kern_timeout.c? */ static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { calloutp = (struct callout *)kn->kn_hook; callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ /* XXX - move to kern_timeout.c? */ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; atomic_add_int(&kq_ncallouts, 1); if (kq_ncallouts >= kq_calloutmax) { atomic_add_int(&kq_ncallouts, -1); return (ENOMEM); } kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */ calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, CALLOUT_MPSAFE); kn->kn_hook = calloutp; callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); return (0); } /* XXX - move to kern_timeout.c? */ static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_drain(calloutp); free(calloutp, M_KQUEUE); atomic_add_int(&kq_ncallouts, -1); kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */ } /* XXX - move to kern_timeout.c? */ static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; /* An extra reference on `nfp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK); TAILQ_INIT(&kq->kq_head); kq->kq_fdp = fdp; knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); FILEDESC_XLOCK(fdp); SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif int kevent(struct thread *td, struct kevent_args *uap) { struct timespec ts, *tsp; struct kevent_copyops k_ops = { uap, kevent_copyout, kevent_copyin}; int error; #ifdef KTRACE struct uio ktruio; struct iovec ktriov; struct uio *ktruioin = NULL; struct uio *ktruioout = NULL; #endif if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) { ktriov.iov_base = uap->changelist; ktriov.iov_len = uap->nchanges * sizeof(struct kevent); ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td }; ktruioin = cloneuio(&ktruio); ktriov.iov_base = uap->eventlist; ktriov.iov_len = uap->nevents * sizeof(struct kevent); ktruioout = cloneuio(&ktruio); } #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, &k_ops, tsp); #ifdef KTRACE if (ktruioin != NULL) { ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent); ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent); ktrgenio(uap->fd, UIO_READ, ktruioout, error); } #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; struct kqueue *kq; struct file *fp; int i, n, nerrors, error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto done_norel; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) goto done; changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } else { goto done; } } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td); done: kqueue_release(kq, 0); done_norel: fdrop(fp, td); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; int error, filt, event; int haskqglobal; fp = NULL; kn = NULL; error = 0; haskqglobal = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; tkn = knote_alloc(waitok); /* prevent waiting with locks */ findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); error = fget(td, kev->ident, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * if we add some inteligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stablize. */ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE); kn->kn_status = KN_INFLUX|KN_DETACHED; error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } KN_LIST_LOCK(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); KN_LIST_LOCK(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already. i.e. filt_procattach is called on a zombie process. It * will call filt_proc which will remove it from the list, and NULL * kn_knlist. */ done_ev_add: event = kn->kn_fop->f_event(kn, 0); KQ_LOCK(kq); if (event) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_INFLUX; KN_LIST_UNLOCK(kn); if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { kn->kn_status |= KN_DISABLED; } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); } KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (fp != NULL) fdrop(fp, td); if (tkn != NULL) knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof list, M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { to_free = list; list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof list); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof list, (size - kq->kq_knlistsize) * sizeof list); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are INFLUX. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, *marker; int count, timeout, nkev, error, influx; int haskqglobal, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done_nl; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } marker = knote_alloc(1); if (marker == NULL) { error = ENOMEM; goto done_nl; } marker->kn_status = KN_MARKER; KQ_LOCK(kq); goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = keva; if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", timeout); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || (kn->kn_status & KN_INFLUX) == KN_INFLUX) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT((kn->kn_status & KN_INFLUX) == 0, ("KN_INFLUX set when not suppose to be")); if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn->kn_status |= KN_INFLUX; kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've marked * it _INFLUX. */ *kevp = kn->kn_kevent; if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KN_LIST_LOCK(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX); kq->kq_count--; KN_LIST_UNLOCK(kn); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~(KN_INFLUX); KN_LIST_UNLOCK(kn); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /* * XXX * This could be expanded to call kqueue_scan, if desired. */ /*ARGSUSED*/ static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; struct knote *kn; int i; int error; if ((error = kqueue_acquire(fp, &kq))) return error; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); fdp = kq->kq_fdp; KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); FILEDESC_XLOCK(fdp); SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list); FILEDESC_XUNLOCK(fdp); + seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); kq->kq_fdp = NULL; if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and set KN_INFLUX), we can eliminate * the kqueue scheduling, but this will introduce four * lock/unlock's for each knote to test. If we do, continue to use * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is * only safe if you want to remove the current item, which we are * not doing. */ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) { KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn->kn_status &= ~KN_INFLUX; if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); } } kq = NULL; } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED")); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); if (!kqislocked) KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX, ("knlist_remove called w/o knote being KN_INFLUX or already removed")); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) knl->kl_unlock(knl->kl_lockarg); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove all knotes from a specified klist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } /* * remove knote from a specified klist while in f_event handler. */ void knlist_remove_inevent(struct knlist *knl, struct knote *kn) { knlist_remove_kq(knl, kn, 1, (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return SLIST_EMPTY(&knl->kl_list); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } void knlist_destroy(struct knlist *knl) { #ifdef INVARIANTS /* * if we run across this error, we need to find the offending * driver and have it call knlist_clear. */ if (!SLIST_EMPTY(&knl->kl_list)) printf("WARNING: destroying knlist w/ knotes on it!\n"); #endif knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL; SLIST_INIT(&knl->kl_list); } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn->kn_status |= KN_INFLUX | KN_DETACHED; KQ_UNLOCK(kq); knote_drop(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= (EV_EOF | EV_ONESHOT); KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still KN_INFLUX remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn->kn_status & KN_INFLUX, ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn->kn_status & KN_INFLUX) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); influx = 1; KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX")); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return ENOMEM; list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return ENOMEM; list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return 0; } /* * knote must already have been detached using the f_detach method. * no lock need to be held, it is assumed that the KN_INFLUX flag is set * to prevent other removal. */ static void knote_drop(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KQ_NOTOWNED(kq); KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX, ("knote_drop called without KN_INFLUX set in kn_status")); KQ_LOCK(kq); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int waitok) { return ((struct knote *)uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO)); } static void knote_free(struct knote *kn) { if (kn != NULL) uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; int error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return error; } Index: stable/8/sys/kern/sys_generic.c =================================================================== --- stable/8/sys/kern/sys_generic.c (revision 225584) +++ stable/8/sys/kern/sys_generic.c (revision 225585) @@ -1,1635 +1,1652 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); static int seltdwait(struct thread *, int); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. */ void *sf_cookie; /* (k) fd or pollfd. */ }; static uma_zone_t selfd_zone; static struct mtx_pool *mtxpool_select; #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return(error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pread(td, uap) struct thread *td; struct pread_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pread(td, uap) struct thread *td; struct freebsd6_pread_args *uap; { struct pread_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (pread(td, &oargs)); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. */ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return(0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return(error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pwrite(td, uap) struct thread *td; struct freebsd6_pwrite_args *uap; { struct pwrite_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (pwrite(td, &oargs)); } /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); tdksignal(td, SIGPIPE, NULL); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Truncate a file given a file descriptor. * * Can't use fget_write() here, since must return EINVAL and not EBADF if the * descriptor isn't writable. */ int kern_ftruncate(td, fd, length) struct thread *td; int fd; off_t length; { struct file *fp; int error; AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); error = fget(td, fd, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); return (EINVAL); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int ftruncate(td, uap) struct thread *td; struct ftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(td, uap) struct thread *td; struct oftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int ioctl(struct thread *td, struct ioctl_args *uap) { u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (com & IOC_VOID) { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } else data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error) { if (size > 0) free(data, M_IOCTLOPS); return (error); } } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); if (size > 0) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error; int tmp; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); if ((error = fget(td, fd, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIOCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIONBIO: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: fdrop(fp, td); return (error); } int poll_no_poll(int events) { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. */ if (events & ~POLLSTANDARD) return (POLLNVAL); return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } int pselect(struct thread *td, struct pselect_args *uap) { struct timespec ts; struct timeval tv, *tvp; sigset_t set, *uset; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&tv, &ts); tvp = &tv; } else tvp = NULL; if (uap->sm != NULL) { error = copyin(uap->sm, &set, sizeof(set)); if (error != 0) return (error); uset = &set; } else uset = NULL; return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, uset, NFDBITS)); } int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits) { int error; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error != 0) return (error); td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int select(struct thread *td, struct select_args *uap) { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, NFDBITS)); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; if (nd > fdp->fd_lastfile + 1) nd = fdp->fd_lastfile + 1; /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; ncpubytes = roundup(nd, abi_nfdbits) / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) { \ ibits[x] = NULL; \ obits[x] = NULL; \ } else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ bzero((char *)ibits[x] + ncpubytes, \ ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) /* * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, * we are running under 32-bit emulation. This should be more * generic. */ #define swizzle_fdset(bits) \ if (abi_nfdbits != NFDBITS && bits != NULL) { \ int i; \ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ } #else #define swizzle_fdset(bits) #endif /* Make sure the bit order makes it through an ABI transition */ swizzle_fdset(ibits[0]); swizzle_fdset(ibits[1]); swizzle_fdset(ibits[2]); if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) break; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; /* swizzle bit order back, if necessary */ swizzle_fdset(obits[0]); swizzle_fdset(obits[1]); swizzle_fdset(obits[2]); #undef swizzle_fdset #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Convert a select bit set to poll flags. * * The backend always returns POLLHUP/POLLERR if appropriate and we * return this as a set bit in any set. */ static int select_flags[3] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND | POLLERR }; /* * Compute the fo_poll flags required for a fd given by the index and * bit position in the fd_mask array. */ static __inline int selflags(fd_mask **ibits, int idx, fd_mask bit) { int flags; int msk; flags = 0; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; flags |= select_flags[msk]; } return (flags); } /* * Set the appropriate output bits given a mask of fired events and the * input bits originally requested. */ static __inline int selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) { int msk; int n; n = 0; for (msk = 0; msk < 3; msk++) { if ((events & select_flags[msk]) == 0) continue; if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; /* * XXX Check for a duplicate set. This can occur because a * socket calls selrecord() twice for each poll() call * resulting in two selfds per real fd. selrescan() will * call selsetbits twice as a result. */ if ((obits[msk][idx] & bit) != 0) continue; obits[msk][idx] |= bit; n++; } return (n); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct filedesc *fdp; struct selinfo *si; struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct file *fp; fd_mask bit; int fd, ev, n, idx; fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. */ static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { struct filedesc *fdp; struct file *fp; fd_mask bit; int ev, flags, end, fd; int n, idx; fdp = td->td_proc->p_fd; n = 0; for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { /* Compute the list of events we're interested in. */ flags = selflags(ibits, idx, bit); if (flags == 0) continue; if ((fp = fget_unlocked(fdp, fd)) == NULL) return (EBADF); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } td->td_retval[0] = n; return (0); } #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int poll(td, uap) struct thread *td; struct poll_args *uap; { struct pollfd *bits; struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; u_int nfds; size_t ni; nfds = uap->nfds; if (nfds > maxfilesperproc && nfds > FD_SETSIZE) return (EINVAL); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) goto done; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = pollscan(td, bits, nfds); if (error || td->td_retval[0] != 0) break; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) break; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = pollout(td, bits, uap->fds, nfds); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); return (error); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; int n; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; fp = fdp->fd_ofiles[fd->fd]; if (fp == NULL) { fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (fd->revents != 0) n++; } FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollout(td, fds, ufds, nfd) struct thread *td; struct pollfd *fds; struct pollfd *ufds; u_int nfd; { int error = 0; u_int i = 0; u_int n = 0; for (i = 0; i < nfd; i++) { error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); if (fds->revents != 0) n++; fds++; ufds++; } td->td_retval[0] = n; return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; FILEDESC_SLOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd]; if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); /* * POSIX requires POLLOUT to be never * set simultaneously with POLLHUP. */ if ((fds->revents & POLLHUP) != 0) fds->revents &= ~POLLOUT; if (fds->revents != 0) n++; } } } FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. It returns * 0 if any events matched and an error otherwise. There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval atv, rtv, ttv; int error, timo; if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) return (EINVAL); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. */ for (;;) { selfdalloc(td, NULL); error = sopoll(so, events, NULL, td); /* error here is actually the ready events. */ if (error) return (0); if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { seltdclear(td); return (EWOULDBLOCK); } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); seltdclear(td); if (error) break; } /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); } /* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */ static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; } static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); mtx_lock(sfp->sf_mtx); if (sfp->sf_si) TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); mtx_unlock(sfp->sf_mtx); uma_zfree(selfd_zone, sfp); } +/* Drain the waiters tied to all the selfd belonging the specified selinfo. */ +void +seldrain(sip) + struct selinfo *sip; +{ + + /* + * This feature is already provided by doselwakeup(), thus it is + * enough to go for it. + * Eventually, the context, should take care to avoid races + * between thread calling select()/poll() and file descriptor + * detaching, but, again, the races are just the same as + * selwakeup(). + */ + doselwakeup(sip, -1); +} + /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. */ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = sip->si_mtx; if (mtxp == NULL) mtxp = mtx_pool_find(mtxpool_select, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. */ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si seltdclear will know to ignore this si. */ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); sfp->sf_si = NULL; stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; if ((stp = td->td_sel) != NULL) goto out; td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); out: stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); } static int seltdwait(struct thread *td, int timo) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } if (timo > 0) error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; if (stp->st_free1) uma_zfree(selfd_zone, stp->st_free1); if (stp->st_free2) uma_zfree(selfd_zone, stp->st_free2); td->td_sel = NULL; free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. */ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); } Index: stable/8/sys/kern/sys_pipe.c =================================================================== --- stable/8/sys/kern/sys_pipe.c (revision 225584) +++ stable/8/sys/kern/sys_pipe.c (revision 225585) @@ -1,1644 +1,1645 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, int backing); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static __inline void pipeselwakeup(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2]) { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipepair *pp; struct pipe *rpipe, *wpipe; int fd, error; pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* Only the forward direction pipe is backed by default */ if ((error = pipe_create(rpipe, 1)) != 0 || (error = pipe_create(wpipe, 0)) != 0) { pipeclose(rpipe); pipeclose(wpipe); return (error); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc(). */ fildes[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops); error = falloc(td, &wf, &fd); if (error) { fdclose(fdp, rf, fildes[0], td); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc(). */ finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } /* ARGSUSED */ int pipe(struct thread *td, struct pipe_args *uap) { int error; int fildes[2]; error = kern_pipe(td, fildes); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace_new(cpipe, size) struct pipe *cpipe; int size; { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(pipe, backing) struct pipe *pipe; int backing; { int error; if (backing) { if (amountpipekva > maxpipekva / 2) error = pipespace_new(pipe, SMALL_PIPE_SIZE); else error = pipespace_new(pipe, PIPE_SIZE); } else { /* If we're not backing this pipe, no need to do anything. */ error = 0; } return (error); } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe = fp->f_data; int error; int nread = 0; u_int size; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { pmap_t pmap; u_int size; int i, j; vm_offset_t addr, endaddr; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; pmap = vmspace_pmap(curproc->p_vmspace); endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); if (endaddr < addr) return (EFAULT); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { /* * vm_fault_quick() can sleep. Consequently, * vm_page_lock_queue() and vm_page_unlock_queue() * should not be performed outside of this loop. */ race: if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { vm_page_lock_queues(); for (j = 0; j < i; j++) vm_page_unhold(wpipe->pipe_map.ms[j]); vm_page_unlock_queues(); return (EFAULT); } wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr, VM_PROT_READ); if (wpipe->pipe_map.ms[i] == NULL) goto race; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_lock_queues(); for (i = 0; i < wpipe->pipe_map.npages; i++) { vm_page_unhold(wpipe->pipe_map.ms[i]); } vm_page_unlock_queues(); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (error) { pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int desiredsize, orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* ARGSUSED */ static int pipe_truncate(fp, length, active_cred, td) struct file *fp; off_t length; struct ucred *active_cred; struct thread *td; { return (EINVAL); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(fp, cmd, data, active_cred, td) struct file *fp; u_long cmd; void *data; struct ucred *active_cred; struct thread *td; { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe = fp->f_data; struct pipe *wpipe; int revents = 0; #ifdef MAC int error; #endif wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) revents |= events & (POLLOUT | POLLWRNORM); if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe = fp->f_data; #ifdef MAC int error; PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) return (error); #endif bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { struct pipe *cpipe = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; funsetown(&cpipe->pipe_sigio); pipeclose(cpipe); return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; + seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; cpipe = kn->kn_fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = cpipe->pipe_peer; break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; PIPE_LOCK(cpipe); if (kn->kn_filter == EVFILT_WRITE) cpipe = cpipe->pipe_peer; knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK(rpipe); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } ret = kn->kn_data > 0; PIPE_UNLOCK(rpipe); return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; PIPE_UNLOCK(rpipe); return (kn->kn_data >= PIPE_BUF); } Index: stable/8/sys/kern/tty.c =================================================================== --- stable/8/sys/kern/tty.c (revision 225584) +++ stable/8/sys/kern/tty.c (revision 225585) @@ -1,2149 +1,2151 @@ /*- * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #ifdef COMPAT_43TTY #include #endif /* COMPAT_43TTY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include static MALLOC_DEFINE(M_TTY, "tty", "tty device"); static void tty_rel_free(struct tty *tp); static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list); static struct sx tty_list_sx; SX_SYSINIT(tty_list, &tty_list_sx, "tty list"); static unsigned int tty_list_count = 0; /* Character device of /dev/console. */ static struct cdev *dev_console; static const char *dev_console_filename; /* * Flags that are supported and stored by this implementation. */ #define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\ INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL) #define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET) #define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\ FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW) #define TTY_CALLOUT(tp,d) ((d) != (tp)->t_dev && (d) != dev_console) /* * Set TTY buffer sizes. */ #define TTYBUF_MAX 65536 static void tty_watermarks(struct tty *tp) { size_t bs; /* Provide an input buffer for 0.2 seconds of data. */ bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX); ttyinq_setsize(&tp->t_inq, tp, bs); /* Set low watermark at 10% (when 90% is available). */ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10; /* Provide an ouput buffer for 0.2 seconds of data. */ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX); ttyoutq_setsize(&tp->t_outq, tp, bs); /* Set low watermark at 10% (when 90% is available). */ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10; } static int tty_drain(struct tty *tp) { int error; if (ttyhook_hashook(tp, getc_inject)) /* buffer is inaccessible */ return (0); while (ttyoutq_bytesused(&tp->t_outq) > 0) { ttydevsw_outwakeup(tp); /* Could be handled synchronously. */ if (ttyoutq_bytesused(&tp->t_outq) == 0) return (0); /* Wait for data to be drained. */ error = tty_wait(tp, &tp->t_outwait); if (error) return (error); } return (0); } /* * Though ttydev_enter() and ttydev_leave() seem to be related, they * don't have to be used together. ttydev_enter() is used by the cdev * operations to prevent an actual operation from being processed when * the TTY has been abandoned. ttydev_leave() is used by ttydev_open() * and ttydev_close() to determine whether per-TTY data should be * deallocated. */ static __inline int ttydev_enter(struct tty *tp) { tty_lock(tp); if (tty_gone(tp) || !tty_opened(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } return (0); } static void ttydev_leave(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) { /* Device is still opened somewhere. */ tty_unlock(tp); return; } tp->t_flags |= TF_OPENCLOSE; /* Stop asynchronous I/O. */ funsetown(&tp->t_sigio); /* Remove console TTY. */ if (constty == tp) constty_clear(); /* Drain any output. */ MPASS((tp->t_flags & TF_STOPPED) == 0); if (!tty_gone(tp)) tty_drain(tp); ttydisc_close(tp); /* Destroy associated buffers already. */ ttyinq_free(&tp->t_inq); tp->t_inlow = 0; ttyoutq_free(&tp->t_outq); tp->t_outlow = 0; knlist_clear(&tp->t_inpoll.si_note, 1); knlist_clear(&tp->t_outpoll.si_note, 1); if (!tty_gone(tp)) ttydevsw_close(tp); tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); tty_rel_free(tp); } /* * Operations that are exposed through the character device in /dev. */ static int ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp = dev->si_drv1; int error = 0; tty_lock(tp); if (tty_gone(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } /* * Block when other processes are currently opening or closing * the TTY. */ while (tp->t_flags & TF_OPENCLOSE) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) { tty_unlock(tp); return (error); } } tp->t_flags |= TF_OPENCLOSE; /* * Make sure the "tty" and "cua" device cannot be opened at the * same time. */ if (TTY_CALLOUT(tp, dev)) { if (tp->t_flags & TF_OPENED_IN) { error = EBUSY; goto done; } } else { if (tp->t_flags & TF_OPENED_OUT) { error = EBUSY; goto done; } } if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) { error = EBUSY; goto done; } if (!tty_opened(tp)) { /* Set proper termios flags. */ if (TTY_CALLOUT(tp, dev)) { tp->t_termios = tp->t_termios_init_out; } else { tp->t_termios = tp->t_termios_init_in; } ttydevsw_param(tp, &tp->t_termios); ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) goto done; ttydisc_open(tp); tty_watermarks(tp); } /* Wait for Carrier Detect. */ if (!TTY_CALLOUT(tp, dev) && (oflags & O_NONBLOCK) == 0 && (tp->t_termios.c_cflag & CLOCAL) == 0) { while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) goto done; } } if (dev == dev_console) tp->t_flags |= TF_OPENED_CONS; else if (TTY_CALLOUT(tp, dev)) tp->t_flags |= TF_OPENED_OUT; else tp->t_flags |= TF_OPENED_IN; done: tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (error); } static int ttydev_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct tty *tp = dev->si_drv1; tty_lock(tp); /* * Don't actually close the device if it is being used as the * console. */ MPASS((tp->t_flags & TF_OPENED) != TF_OPENED); if (dev == dev_console) tp->t_flags &= ~TF_OPENED_CONS; else tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT); if (tp->t_flags & TF_OPENED) { tty_unlock(tp); return (0); } /* * This can only be called once. The callin and the callout * devices cannot be opened at the same time. */ tp->t_flags &= ~(TF_EXCLUDE|TF_STOPPED); /* Properly wake up threads that are stuck - revoke(). */ tp->t_revokecnt++; tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (0); } static __inline int tty_is_ctty(struct tty *tp, struct proc *p) { tty_lock_assert(tp, MA_OWNED); return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT); } static int tty_wait_background(struct tty *tp, struct thread *td, int sig) { struct proc *p = td->td_proc; struct pgrp *pg; ksiginfo_t ksi; int error; MPASS(sig == SIGTTIN || sig == SIGTTOU); tty_lock_assert(tp, MA_OWNED); for (;;) { PROC_LOCK(p); /* * The process should only sleep, when: * - This terminal is the controling terminal * - Its process group is not the foreground process * group * - The parent process isn't waiting for the child to * exit * - the signal to send to the process isn't masked */ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) { /* Allow the action to happen. */ PROC_UNLOCK(p); return (0); } if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) || SIGISMEMBER(td->td_sigmask, sig)) { /* Only allow them in write()/ioctl(). */ PROC_UNLOCK(p); return (sig == SIGTTOU ? 0 : EIO); } pg = p->p_pgrp; if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) { /* Don't allow the action to happen. */ PROC_UNLOCK(p); return (EIO); } PROC_UNLOCK(p); /* * Send the signal and sleep until we're the new * foreground process group. */ if (sig != 0) { ksiginfo_init(&ksi); ksi.ksi_code = SI_KERNEL; ksi.ksi_signo = sig; sig = 0; } PGRP_LOCK(pg); pgsignal(pg, ksi.ksi_signo, 1, &ksi); PGRP_UNLOCK(pg); error = tty_wait(tp, &tp->t_bgwait); if (error) return (error); } } static int ttydev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) goto done; error = tty_wait_background(tp, curthread, SIGTTIN); if (error) { tty_unlock(tp); goto done; } error = ttydisc_read(tp, uio, ioflag); tty_unlock(tp); /* * The read() call should not throw an error when the device is * being destroyed. Silently convert it to an EOF. */ done: if (error == ENXIO) error = 0; return (error); } static int ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); if (tp->t_termios.c_lflag & TOSTOP) { error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) { /* Allow non-blocking writes to bypass serialization. */ error = ttydisc_write(tp, uio, ioflag); } else { /* Serialize write() calls. */ while (tp->t_flags & TF_BUSY_OUT) { error = tty_wait(tp, &tp->t_outserwait); if (error) goto done; } tp->t_flags |= TF_BUSY_OUT; error = ttydisc_write(tp, uio, ioflag); tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } done: tty_unlock(tp); return (error); } static int ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if 0 case TIOCSDRAINWAIT: case TIOCSETD: #endif #ifdef COMPAT_43TTY case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif /* COMPAT_43TTY */ /* * If the ioctl() causes the TTY to be modified, let it * wait in the background. */ error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) { struct termios *old = &tp->t_termios; struct termios *new = (struct termios *)data; struct termios *lock = TTY_CALLOUT(tp, dev) ? &tp->t_termios_lock_out : &tp->t_termios_lock_in; int cc; /* * Lock state devices. Just overwrite the values of the * commands that are currently in use. */ new->c_iflag = (old->c_iflag & lock->c_iflag) | (new->c_iflag & ~lock->c_iflag); new->c_oflag = (old->c_oflag & lock->c_oflag) | (new->c_oflag & ~lock->c_oflag); new->c_cflag = (old->c_cflag & lock->c_cflag) | (new->c_cflag & ~lock->c_cflag); new->c_lflag = (old->c_lflag & lock->c_lflag) | (new->c_lflag & ~lock->c_lflag); for (cc = 0; cc < NCCS; ++cc) if (lock->c_cc[cc]) new->c_cc[cc] = old->c_cc[cc]; if (lock->c_ispeed) new->c_ispeed = old->c_ispeed; if (lock->c_ospeed) new->c_ospeed = old->c_ospeed; } error = tty_ioctl(tp, cmd, data, fflag, td); done: tty_unlock(tp); return (error); } static int ttydev_poll(struct cdev *dev, int events, struct thread *td) { struct tty *tp = dev->si_drv1; int error, revents = 0; error = ttydev_enter(tp); if (error) return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); if (events & (POLLIN|POLLRDNORM)) { /* See if we can read something. */ if (ttydisc_read_poll(tp) > 0) revents |= events & (POLLIN|POLLRDNORM); } if (tp->t_flags & TF_ZOMBIE) { /* Hangup flag on zombie state. */ revents |= POLLHUP; } else if (events & (POLLOUT|POLLWRNORM)) { /* See if we can write something. */ if (ttydisc_write_poll(tp) > 0) revents |= events & (POLLOUT|POLLWRNORM); } if (revents == 0) { if (events & (POLLIN|POLLRDNORM)) selrecord(td, &tp->t_inpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &tp->t_outpoll); } tty_unlock(tp); return (revents); } static int ttydev_mmap(struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) { struct tty *tp = dev->si_drv1; int error; /* Handle mmap() through the driver. */ error = ttydev_enter(tp); if (error) return (-1); error = ttydevsw_mmap(tp, offset, paddr, nprot); tty_unlock(tp); return (error); } /* * kqueue support. */ static void tty_kqops_read_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_inpoll.si_note, kn, 0); } static int tty_kqops_read_event(struct knote *kn, long hint) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_read_poll(tp); return (kn->kn_data > 0); } } static void tty_kqops_write_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_outpoll.si_note, kn, 0); } static int tty_kqops_write_event(struct knote *kn, long hint) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_write_poll(tp); return (kn->kn_data > 0); } } static struct filterops tty_kqops_read = { 1, NULL, tty_kqops_read_detach, tty_kqops_read_event }; static struct filterops tty_kqops_write = { 1, NULL, tty_kqops_write_detach, tty_kqops_write_event }; static int ttydev_kqfilter(struct cdev *dev, struct knote *kn) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_read; knlist_add(&tp->t_inpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_write; knlist_add(&tp->t_outpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static struct cdevsw ttydev_cdevsw = { .d_version = D_VERSION, .d_open = ttydev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttydev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttydev", .d_flags = D_TTY, }; /* * Init/lock-state devices */ static int ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp = dev->si_drv1; int error = 0; tty_lock(tp); if (tty_gone(tp)) error = ENODEV; tty_unlock(tp); return (error); } static int ttyil_close(struct cdev *dev, int flag, int mode, struct thread *td) { return (0); } static int ttyil_rdwr(struct cdev *dev, struct uio *uio, int ioflag) { return (ENODEV); } static int ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error = 0; tty_lock(tp); if (tty_gone(tp)) { error = ENODEV; goto done; } switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = *(struct termios*)dev->si_drv2; break; case TIOCSETA: /* Set terminal flags through tcsetattr(). */ error = priv_check(td, PRIV_TTY_SETA); if (error) break; *(struct termios*)dev->si_drv2 = *(struct termios*)data; break; case TIOCGETD: *(int *)data = TTYDISC; break; case TIOCGWINSZ: bzero(data, sizeof(struct winsize)); break; default: error = ENOTTY; } done: tty_unlock(tp); return (error); } static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, .d_open = ttyil_open, .d_close = ttyil_close, .d_read = ttyil_rdwr, .d_write = ttyil_rdwr, .d_ioctl = ttyil_ioctl, .d_name = "ttyil", .d_flags = D_TTY, }; static void tty_init_termios(struct tty *tp) { struct termios *t = &tp->t_termios_init_in; t->c_cflag = TTYDEF_CFLAG; t->c_iflag = TTYDEF_IFLAG; t->c_lflag = TTYDEF_LFLAG; t->c_oflag = TTYDEF_OFLAG; t->c_ispeed = TTYDEF_SPEED; t->c_ospeed = TTYDEF_SPEED; memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars); tp->t_termios_init_out = *t; } void tty_init_console(struct tty *tp, speed_t s) { struct termios *ti = &tp->t_termios_init_in; struct termios *to = &tp->t_termios_init_out; if (s != 0) { ti->c_ispeed = ti->c_ospeed = s; to->c_ispeed = to->c_ospeed = s; } ti->c_cflag |= CLOCAL; to->c_cflag |= CLOCAL; } /* * Standard device routine implementations, mostly meant for * pseudo-terminal device drivers. When a driver creates a new terminal * device class, missing routines are patched. */ static int ttydevsw_defopen(struct tty *tp) { return (0); } static void ttydevsw_defclose(struct tty *tp) { } static void ttydevsw_defoutwakeup(struct tty *tp) { panic("Terminal device has output, while not implemented"); } static void ttydevsw_definwakeup(struct tty *tp) { } static int ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) { return (ENOIOCTL); } static int ttydevsw_defparam(struct tty *tp, struct termios *t) { /* Use a fake baud rate, we're not a real device. */ t->c_ispeed = t->c_ospeed = TTYDEF_SPEED; return (0); } static int ttydevsw_defmodem(struct tty *tp, int sigon, int sigoff) { /* Simulate a carrier to make the TTY layer happy. */ return (SER_DCD); } static int ttydevsw_defmmap(struct tty *tp, vm_offset_t offset, vm_paddr_t *paddr, int nprot) { return (-1); } static void ttydevsw_defpktnotify(struct tty *tp, char event) { } static void ttydevsw_deffree(void *softc) { panic("Terminal device freed without a free-handler"); } /* * TTY allocation and deallocation. TTY devices can be deallocated when * the driver doesn't use it anymore, when the TTY isn't a session's * controlling TTY and when the device node isn't opened through devfs. */ struct tty * tty_alloc(struct ttydevsw *tsw, void *sc) { return (tty_alloc_mutex(tsw, sc, NULL)); } struct tty * tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) { struct tty *tp; /* Make sure the driver defines all routines. */ #define PATCH_FUNC(x) do { \ if (tsw->tsw_ ## x == NULL) \ tsw->tsw_ ## x = ttydevsw_def ## x; \ } while (0) PATCH_FUNC(open); PATCH_FUNC(close); PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); PATCH_FUNC(pktnotify); PATCH_FUNC(free); #undef PATCH_FUNC tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO); tp->t_devsw = tsw; tp->t_devswsoftc = sc; tp->t_flags = tsw->tsw_flags; tty_init_termios(tp); cv_init(&tp->t_inwait, "ttyin"); cv_init(&tp->t_outwait, "ttyout"); cv_init(&tp->t_outserwait, "ttyosr"); cv_init(&tp->t_bgwait, "ttybg"); cv_init(&tp->t_dcdwait, "ttydcd"); /* Allow drivers to use a custom mutex to lock the TTY. */ if (mutex != NULL) { tp->t_mtx = mutex; } else { tp->t_mtx = &tp->t_mtxobj; mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF); } knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx); sx_xlock(&tty_list_sx); TAILQ_INSERT_TAIL(&tty_list, tp, t_list); tty_list_count++; sx_xunlock(&tty_list_sx); return (tp); } static void tty_dealloc(void *arg) { struct tty *tp = arg; sx_xlock(&tty_list_sx); TAILQ_REMOVE(&tty_list, tp, t_list); tty_list_count--; sx_xunlock(&tty_list_sx); /* Make sure we haven't leaked buffers. */ MPASS(ttyinq_getsize(&tp->t_inq) == 0); MPASS(ttyoutq_getsize(&tp->t_outq) == 0); + seldrain(&tp->t_inpoll); + seldrain(&tp->t_outpoll); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); cv_destroy(&tp->t_inwait); cv_destroy(&tp->t_outwait); cv_destroy(&tp->t_bgwait); cv_destroy(&tp->t_dcdwait); cv_destroy(&tp->t_outserwait); if (tp->t_mtx == &tp->t_mtxobj) mtx_destroy(&tp->t_mtxobj); ttydevsw_free(tp); free(tp, M_TTY); } static void tty_rel_free(struct tty *tp) { struct cdev *dev; tty_lock_assert(tp, MA_OWNED); #define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE) if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) { /* TTY is still in use. */ tty_unlock(tp); return; } /* TTY can be deallocated. */ dev = tp->t_dev; tp->t_dev = NULL; tty_unlock(tp); if (dev != NULL) destroy_dev_sched_cb(dev, tty_dealloc, tp); } void tty_rel_pgrp(struct tty *tp, struct pgrp *pg) { MPASS(tp->t_sessioncnt > 0); tty_lock_assert(tp, MA_OWNED); if (tp->t_pgrp == pg) tp->t_pgrp = NULL; tty_unlock(tp); } void tty_rel_sess(struct tty *tp, struct session *sess) { MPASS(tp->t_sessioncnt > 0); /* Current session has left. */ if (tp->t_session == sess) { tp->t_session = NULL; MPASS(tp->t_pgrp == NULL); } tp->t_sessioncnt--; tty_rel_free(tp); } void tty_rel_gone(struct tty *tp) { MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ ttydisc_modem(tp, 0); /* Wake up all blocked threads. */ tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); tp->t_flags |= TF_GONE; tty_rel_free(tp); } /* * Exposing information about current TTY's through sysctl */ static void tty_to_xtty(struct tty *tp, struct xtty *xt) { tty_lock_assert(tp, MA_OWNED); xt->xt_size = sizeof(struct xtty); xt->xt_insize = ttyinq_getsize(&tp->t_inq); xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq); xt->xt_inlc = ttyinq_bytesline(&tp->t_inq); xt->xt_inlow = tp->t_inlow; xt->xt_outsize = ttyoutq_getsize(&tp->t_outq); xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq); xt->xt_outlow = tp->t_outlow; xt->xt_column = tp->t_column; xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0; xt->xt_flags = tp->t_flags; xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : NODEV; } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { unsigned long lsize; struct xtty *xtlist, *xt; struct tty *tp; int error; sx_slock(&tty_list_sx); lsize = tty_list_count * sizeof(struct xtty); if (lsize == 0) { sx_sunlock(&tty_list_sx); return (0); } xtlist = xt = malloc(lsize, M_TTY, M_WAITOK); TAILQ_FOREACH(tp, &tty_list, t_list) { tty_lock(tp); tty_to_xtty(tp, xt); tty_unlock(tp); xt++; } sx_sunlock(&tty_list_sx); error = SYSCTL_OUT(req, xtlist, lsize); free(xtlist, M_TTY); return (error); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs"); /* * Device node creation. Device has been set up, now we can expose it to * the user. */ void tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) { va_list ap; struct cdev *dev; const char *prefix = "tty"; char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */ uid_t uid; gid_t gid; mode_t mode; /* Remove "tty" prefix from devices like PTY's. */ if (tp->t_flags & TF_NOPREFIX) prefix = ""; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); if (cred == NULL) { /* System device. */ uid = UID_ROOT; gid = GID_WHEEL; mode = S_IRUSR|S_IWUSR; } else { /* User device. */ uid = cred->cr_ruid; gid = GID_TTY; mode = S_IRUSR|S_IWUSR|S_IWGRP; } /* Master call-in device. */ dev = make_dev_cred(&ttydev_cdevsw, 0, cred, uid, gid, mode, "%s%s", prefix, name); dev->si_drv1 = tp; tp->t_dev = dev; /* Slave call-in devices. */ if (tp->t_flags & TF_INITLOCK) { dev = make_dev_cred(&ttyil_cdevsw, 0, cred, uid, gid, mode, "%s%s.init", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_in; dev = make_dev_cred(&ttyil_cdevsw, 0, cred, uid, gid, mode, "%s%s.lock", prefix, name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_lock_in; } /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { dev = make_dev_cred(&ttydev_cdevsw, 0, cred, UID_UUCP, GID_DIALER, 0660, "cua%s", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; /* Slave call-out devices. */ if (tp->t_flags & TF_INITLOCK) { dev = make_dev_cred(&ttyil_cdevsw, 0, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.init", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_init_out; dev = make_dev_cred(&ttyil_cdevsw, 0, cred, UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name); dev_depends(tp->t_dev, dev); dev->si_drv1 = tp; dev->si_drv2 = &tp->t_termios_lock_out; } } } /* * Signalling processes. */ void tty_signal_sessleader(struct tty *tp, int sig) { struct proc *p; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (tp->t_session != NULL && tp->t_session->s_leader != NULL) { p = tp->t_session->s_leader; PROC_LOCK(p); psignal(p, sig); PROC_UNLOCK(p); } } void tty_signal_pgrp(struct tty *tp, int sig) { ksiginfo_t ksi; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO)) tty_info(tp); if (tp->t_pgrp != NULL) { ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, sig, 1, &ksi); PGRP_UNLOCK(tp->t_pgrp); } } void tty_wakeup(struct tty *tp, int flags) { if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (flags & FWRITE) { cv_broadcast(&tp->t_outwait); selwakeup(&tp->t_outpoll); KNOTE_LOCKED(&tp->t_outpoll.si_note, 0); } if (flags & FREAD) { cv_broadcast(&tp->t_inwait); selwakeup(&tp->t_inpoll); KNOTE_LOCKED(&tp->t_inpoll.si_note, 0); } } int tty_wait(struct tty *tp, struct cv *cv) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_wait_sig(cv, tp->t_mtx); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); return (error); } int tty_timedwait(struct tty *tp, struct cv *cv, int hz) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_timedwait_sig(cv, tp->t_mtx, hz); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); return (error); } void tty_flush(struct tty *tp, int flags) { if (flags & FWRITE) { tp->t_flags &= ~TF_HIWAT_OUT; ttyoutq_flush(&tp->t_outq); tty_wakeup(tp, FWRITE); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE); } if (flags & FREAD) { tty_hiwat_in_unblock(tp); ttyinq_flush(&tp->t_inq); ttydevsw_inwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD); } } static int tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; switch (cmd) { /* * Modem commands. * The SER_* and TIOCM_* flags are the same, but one bit * shifted. I don't know why. */ case TIOCSDTR: ttydevsw_modem(tp, SER_DTR, 0); return (0); case TIOCCDTR: ttydevsw_modem(tp, 0, SER_DTR); return (0); case TIOCMSET: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMBIS: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0); return (0); } case TIOCMBIC: { int bits = *(int *)data; ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMGET: *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1); return (0); case FIOASYNC: if (*(int *)data) tp->t_flags |= TF_ASYNC; else tp->t_flags &= ~TF_ASYNC; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: *(int *)data = ttyinq_bytescanonicalized(&tp->t_inq); return (0); case FIONWRITE: case TIOCOUTQ: *(int *)data = ttyoutq_bytesused(&tp->t_outq); return (0); case FIOSETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Temporarily unlock the TTY to set ownership. */ tty_unlock(tp); error = fsetown(*(int *)data, &tp->t_sigio); tty_lock(tp); return (error); case FIOGETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Get ownership. */ *(int *)data = fgetown(&tp->t_sigio); return (0); case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = tp->t_termios; return (0); case TIOCSETA: case TIOCSETAW: case TIOCSETAF: { struct termios *t = data; /* * Who makes up these funny rules? According to POSIX, * input baud rate is set equal to the output baud rate * when zero. */ if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; /* Discard any unsupported bits. */ t->c_iflag &= TTYSUP_IFLAG; t->c_oflag &= TTYSUP_OFLAG; t->c_lflag &= TTYSUP_LFLAG; t->c_cflag &= TTYSUP_CFLAG; /* Set terminal flags through tcsetattr(). */ if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = tty_drain(tp); if (error) return (error); if (cmd == TIOCSETAF) tty_flush(tp, FREAD); } /* * Only call param() when the flags really change. */ if ((t->c_cflag & CIGNORE) == 0 && (tp->t_termios.c_cflag != t->c_cflag || tp->t_termios.c_ispeed != t->c_ispeed || tp->t_termios.c_ospeed != t->c_ospeed)) { error = ttydevsw_param(tp, t); if (error) return (error); /* XXX: CLOCAL? */ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; /* Baud rate has changed - update watermarks. */ tty_watermarks(tp); } /* Copy new non-device driver parameters. */ tp->t_termios.c_iflag = t->c_iflag; tp->t_termios.c_oflag = t->c_oflag; tp->t_termios.c_lflag = t->c_lflag; memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc); ttydisc_optimize(tp); if ((t->c_lflag & ICANON) == 0) { /* * When in non-canonical mode, wake up all * readers. Canonicalize any partial input. VMIN * and VTIME could also be adjusted. */ ttyinq_canonicalize(&tp->t_inq); tty_wakeup(tp, FREAD); } /* * For packet mode: notify the PTY consumer that VSTOP * and VSTART may have been changed. */ if (tp->t_termios.c_iflag & IXON && tp->t_termios.c_cc[VSTOP] == CTRL('S') && tp->t_termios.c_cc[VSTART] == CTRL('Q')) ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP); else ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP); return (0); } case TIOCGETD: /* For compatibility - we only support TTYDISC. */ *(int *)data = TTYDISC; return (0); case TIOCGPGRP: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; return (0); case TIOCGSID: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); MPASS(tp->t_session); *(int *)data = tp->t_session->s_sid; return (0); case TIOCSCTTY: { struct proc *p = td->td_proc; /* XXX: This looks awful. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ sx_xunlock(&proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ sx_xunlock(&proctree_lock); return (0); } if (p->p_session->s_ttyp != NULL || (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL && tp->t_session->s_ttyvp->v_type != VBAD)) { /* * There is already a relation between a TTY and * a session, or the caller is not the session * leader. * * Allow the TTY to be stolen when the vnode is * invalid, but the reference to the TTY is * still active. This allows immediate reuse of * TTYs of which the session leader has been * killed or the TTY revoked. */ sx_xunlock(&proctree_lock); return (EPERM); } /* Connect the session to the TTY. */ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; sx_xunlock(&proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); return (0); } case TIOCSPGRP: { struct pgrp *pg; /* * XXX: Temporarily unlock the TTY to locate the process * group. This code would be lot nicer if we would ever * decompose proctree_lock. */ tty_unlock(tp); sx_slock(&proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { sx_sunlock(&proctree_lock); tty_lock(tp); return (EPERM); } tty_lock(tp); /* * Determine if this TTY is the controlling TTY after * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { sx_sunlock(&proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; sx_sunlock(&proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); return (0); } case TIOCFLUSH: { int flags = *(int *)data; if (flags == 0) flags = (FREAD|FWRITE); else flags &= (FREAD|FWRITE); tty_flush(tp, flags); return (0); } case TIOCDRAIN: /* Drain TTY output. */ return tty_drain(tp); case TIOCCONS: /* Set terminal as console TTY. */ if (*(int *)data) { error = priv_check(td, PRIV_TTY_CONSOLE); if (error) return (error); /* * XXX: constty should really need to be locked! * XXX: allow disconnected constty's to be stolen! */ if (constty == tp) return (0); if (constty != NULL) return (EBUSY); tty_unlock(tp); constty_set(tp); tty_lock(tp); } else if (constty == tp) { constty_clear(); } return (0); case TIOCGWINSZ: /* Obtain window size. */ *(struct winsize*)data = tp->t_winsize; return (0); case TIOCSWINSZ: /* Set window size. */ if (bcmp(&tp->t_winsize, data, sizeof(struct winsize)) == 0) return (0); tp->t_winsize = *(struct winsize*)data; tty_signal_pgrp(tp, SIGWINCH); return (0); case TIOCEXCL: tp->t_flags |= TF_EXCLUDE; return (0); case TIOCNXCL: tp->t_flags &= ~TF_EXCLUDE; return (0); case TIOCSTOP: tp->t_flags |= TF_STOPPED; ttydevsw_pktnotify(tp, TIOCPKT_STOP); return (0); case TIOCSTART: tp->t_flags &= ~TF_STOPPED; ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_START); return (0); case TIOCSTAT: tty_info(tp); return (0); case TIOCSTI: if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI)) return (EPERM); if (!tty_is_ctty(tp, td->td_proc) && priv_check(td, PRIV_TTY_STI)) return (EACCES); ttydisc_rint(tp, *(char *)data, 0); ttydisc_rint_done(tp); return (0); } #ifdef COMPAT_43TTY return tty_ioctl_compat(tp, cmd, data, fflag, td); #else /* !COMPAT_43TTY */ return (ENOIOCTL); #endif /* COMPAT_43TTY */ } int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) return (ENXIO); error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); return (error); } dev_t tty_udev(struct tty *tp) { if (tp->t_dev) return dev2udev(tp->t_dev); else return NODEV; } int tty_checkoutq(struct tty *tp) { /* 256 bytes should be enough to print a log message. */ return (ttyoutq_bytesleft(&tp->t_outq) >= 256); } void tty_hiwat_in_block(struct tty *tp) { if ((tp->t_flags & TF_HIWAT_IN) == 0 && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) { /* * Input flow control. Only enter the high watermark when we * can successfully store the VSTOP character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTOP], 1) == 0) tp->t_flags |= TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags |= TF_HIWAT_IN; } } void tty_hiwat_in_unblock(struct tty *tp) { if (tp->t_flags & TF_HIWAT_IN && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) { /* * Input flow control. Only leave the high watermark when we * can successfully store the VSTART character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTART], 1) == 0) tp->t_flags &= ~TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags &= ~TF_HIWAT_IN; } if (!tty_gone(tp)) ttydevsw_inwakeup(tp); } /* * TTY hooks interface. */ static int ttyhook_defrint(struct tty *tp, char c, int flags) { if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); return (0); } int ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th, void *softc) { struct tty *tp; struct file *fp; struct cdev *dev; struct cdevsw *cdp; struct filedesc *fdp; int error, ref; /* Validate the file descriptor. */ if ((fdp = p->p_fd) == NULL) return (EBADF); fp = fget_unlocked(fdp, fd); if (fp == NULL) return (EBADF); if (fp->f_ops == &badfileops) { error = EBADF; goto done1; } /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we * only shall prevent calling devvn_refthread on the file that * never has been opened over a character device. */ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) { error = EINVAL; goto done1; } /* Make sure it is a TTY. */ cdp = devvn_refthread(fp->f_vnode, &dev, &ref); if (cdp == NULL) { error = ENXIO; goto done1; } if (dev != fp->f_data) { error = ENXIO; goto done2; } if (cdp != &ttydev_cdevsw) { error = ENOTTY; goto done2; } tp = dev->si_drv1; /* Try to attach the hook to the TTY. */ error = EBUSY; tty_lock(tp); MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0)); if (tp->t_flags & TF_HOOK) goto done3; tp->t_flags |= TF_HOOK; tp->t_hook = th; tp->t_hooksoftc = softc; *rtp = tp; error = 0; /* Maybe we can switch into bypass mode now. */ ttydisc_optimize(tp); /* Silently convert rint() calls to rint_bypass() when possible. */ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass)) th->th_rint = ttyhook_defrint; done3: tty_unlock(tp); done2: dev_relthread(dev, ref); done1: fdrop(fp, curthread); return (error); } void ttyhook_unregister(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); MPASS(tp->t_flags & TF_HOOK); /* Disconnect the hook. */ tp->t_flags &= ~TF_HOOK; tp->t_hook = NULL; /* Maybe we need to leave bypass mode. */ ttydisc_optimize(tp); /* Maybe deallocate the TTY as well. */ tty_rel_free(tp); } /* * /dev/console handling. */ static int ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp; /* System has no console device. */ if (dev_console_filename == NULL) return (ENXIO); /* Look up corresponding TTY by device name. */ sx_slock(&tty_list_sx); TAILQ_FOREACH(tp, &tty_list, t_list) { if (strcmp(dev_console_filename, tty_devname(tp)) == 0) { dev_console->si_drv1 = tp; break; } } sx_sunlock(&tty_list_sx); /* System console has no TTY associated. */ if (dev_console->si_drv1 == NULL) return (ENXIO); return (ttydev_open(dev, oflags, devtype, td)); } static int ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag) { log_console(uio); return (ttydev_write(dev, uio, ioflag)); } /* * /dev/console is a little different than normal TTY's. When opened, * it determines which TTY to use. When data gets written to it, it * will be logged in the kernel message buffer. */ static struct cdevsw ttyconsdev_cdevsw = { .d_version = D_VERSION, .d_open = ttyconsdev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttyconsdev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttyconsdev", .d_flags = D_TTY, }; static void ttyconsdev_init(void *unused) { dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "console"); } SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL); void ttyconsdev_select(const char *name) { dev_console_filename = name; } /* * Debugging routines. */ #include "opt_ddb.h" #ifdef DDB #include #include static struct { int flag; char val; } ttystates[] = { #if 0 { TF_NOPREFIX, 'N' }, #endif { TF_INITLOCK, 'I' }, { TF_CALLOUT, 'C' }, /* Keep these together -> 'Oi' and 'Oo'. */ { TF_OPENED, 'O' }, { TF_OPENED_IN, 'i' }, { TF_OPENED_OUT, 'o' }, { TF_OPENED_CONS, 'c' }, { TF_GONE, 'G' }, { TF_OPENCLOSE, 'B' }, { TF_ASYNC, 'Y' }, { TF_LITERAL, 'L' }, /* Keep these together -> 'Hi' and 'Ho'. */ { TF_HIWAT, 'H' }, { TF_HIWAT_IN, 'i' }, { TF_HIWAT_OUT, 'o' }, { TF_STOPPED, 'S' }, { TF_EXCLUDE, 'X' }, { TF_BYPASS, 'l' }, { TF_ZOMBIE, 'Z' }, { TF_HOOK, 's' }, /* Keep these together -> 'bi' and 'bo'. */ { TF_BUSY, 'b' }, { TF_BUSY_IN, 'i' }, { TF_BUSY_OUT, 'o' }, { 0, '\0'}, }; #define TTY_FLAG_BITS \ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN\5OPENED_OUT\6GONE" \ "\7OPENCLOSE\10ASYNC\11LITERAL\12HIWAT_IN\13HIWAT_OUT\14STOPPED" \ "\15EXCLUDE\16BYPASS\17ZOMBIE\20HOOK" #define DB_PRINTSYM(name, addr) \ db_printf("%s " #name ": ", sep); \ db_printsym((db_addr_t) addr, DB_STGY_ANY); \ db_printf("\n"); static void _db_show_devsw(const char *sep, const struct ttydevsw *tsw) { db_printf("%sdevsw: ", sep); db_printsym((db_addr_t)tsw, DB_STGY_ANY); db_printf(" (%p)\n", tsw); DB_PRINTSYM(open, tsw->tsw_open); DB_PRINTSYM(close, tsw->tsw_close); DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup); DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup); DB_PRINTSYM(ioctl, tsw->tsw_ioctl); DB_PRINTSYM(param, tsw->tsw_param); DB_PRINTSYM(modem, tsw->tsw_modem); DB_PRINTSYM(mmap, tsw->tsw_mmap); DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify); DB_PRINTSYM(free, tsw->tsw_free); } static void _db_show_hooks(const char *sep, const struct ttyhook *th) { db_printf("%shook: ", sep); db_printsym((db_addr_t)th, DB_STGY_ANY); db_printf(" (%p)\n", th); if (th == NULL) return; DB_PRINTSYM(rint, th->th_rint); DB_PRINTSYM(rint_bypass, th->th_rint_bypass); DB_PRINTSYM(rint_done, th->th_rint_done); DB_PRINTSYM(rint_poll, th->th_rint_poll); DB_PRINTSYM(getc_inject, th->th_getc_inject); DB_PRINTSYM(getc_capture, th->th_getc_capture); DB_PRINTSYM(getc_poll, th->th_getc_poll); DB_PRINTSYM(close, th->th_close); } static void _db_show_termios(const char *name, const struct termios *t) { db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x " "lflag 0x%x ispeed %u ospeed %u\n", name, t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag, t->c_ispeed, t->c_ospeed); } /* DDB command to show TTY statistics. */ DB_SHOW_COMMAND(tty, db_show_tty) { struct tty *tp; if (!have_addr) { db_printf("usage: show tty \n"); return; } tp = (struct tty *)addr; db_printf("0x%p: %s\n", tp, tty_devname(tp)); db_printf("\tmtx: %p\n", tp->t_mtx); db_printf("\tflags: %b\n", tp->t_flags, TTY_FLAG_BITS); db_printf("\trevokecnt: %u\n", tp->t_revokecnt); /* Buffering mechanisms. */ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u " "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin, tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end, tp->t_inq.ti_nblocks, tp->t_inq.ti_quota); db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n", &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end, tp->t_outq.to_nblocks, tp->t_outq.to_quota); db_printf("\tinlow: %zu\n", tp->t_inlow); db_printf("\toutlow: %zu\n", tp->t_outlow); _db_show_termios("\ttermios", &tp->t_termios); db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n", tp->t_winsize.ws_row, tp->t_winsize.ws_col, tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel); db_printf("\tcolumn: %u\n", tp->t_column); db_printf("\twritepos: %u\n", tp->t_writepos); db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags); /* Init/lock-state devices. */ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in); _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out); _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in); _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out); /* Hooks */ _db_show_devsw("\t", tp->t_devsw); _db_show_hooks("\t", tp->t_hook); /* Process info. */ db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp, tp->t_pgrp ? tp->t_pgrp->pg_id : 0, tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0); db_printf("\tsession: %p", tp->t_session); if (tp->t_session != NULL) db_printf(" count %u leader %p tty %p sid %d login %s", tp->t_session->s_count, tp->t_session->s_leader, tp->t_session->s_ttyp, tp->t_session->s_sid, tp->t_session->s_login); db_printf("\n"); db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt); db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc); db_printf("\thooksoftc: %p\n", tp->t_hooksoftc); db_printf("\tdev: %p\n", tp->t_dev); } /* DDB command to list TTYs. */ DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys) { struct tty *tp; size_t isiz, osiz; int i, j; /* Make the output look like `pstat -t'. */ db_printf("PTR "); #if defined(__LP64__) db_printf(" "); #endif db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW " "COL SESS PGID STATE\n"); TAILQ_FOREACH(tp, &tty_list, t_list) { isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE; osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE; db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d %5d ", tp, tty_devname(tp), isiz, tp->t_inq.ti_linestart - tp->t_inq.ti_begin, tp->t_inq.ti_end - tp->t_inq.ti_linestart, isiz - tp->t_inlow, osiz, tp->t_outq.to_end - tp->t_outq.to_begin, osiz - tp->t_outlow, MIN(tp->t_column, 99999), tp->t_session ? tp->t_session->s_sid : 0, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); /* Flag bits. */ for (i = j = 0; ttystates[i].flag; i++) if (tp->t_flags & ttystates[i].flag) { db_printf("%c", ttystates[i].val); j++; } if (j == 0) db_printf("-"); db_printf("\n"); } } #endif /* DDB */ Index: stable/8/sys/kern/tty_pts.c =================================================================== --- stable/8/sys/kern/tty_pts.c (revision 225584) +++ stable/8/sys/kern/tty_pts.c (revision 225585) @@ -1,869 +1,871 @@ /*- * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_tty.h" /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT #ifdef DEV_PTY /* Add /dev/ptyXX compat bits. */ #define PTS_EXTERNAL #endif /* DEV_PTY */ /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static unsigned int pts_maxdev = 999; SYSCTL_UINT(_kern, OID_AUTO, pts_maxdev, CTLFLAG_RW, &pts_maxdev, 0, "Maximum amount of pts(4) pseudo-terminals"); static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct uidinfo *pts_uidinfo; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occured (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. */ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. */ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { if (ttydisc_can_bypass(tp)) { /* Store data at once. */ rintlen = ttydisc_rint_bypass(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } } else { error = ttydisc_rint(tp, *ibstart, 0); if (error == 0) { /* Character stored successfully. */ ibstart++; iblen--; continue; } } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Force read() to be called. */ *(int *)data = 1; } else { *(int *)data = ttydisc_getc_poll(tp); } tty_unlock(tp); return (0); case FIODGNAME: { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return copyout(p, fgn->buf, i); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. */ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. */ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } static struct filterops pts_kqops_read = { 1, NULL, pts_kqops_read_detach, pts_kqops_read_event }; static struct filterops pts_kqops_write = { 1, NULL, pts_kqops_write_detach, pts_kqops_write_event }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atimespec = dev->si_atime; sb->st_ctimespec = dev->si_ctime; sb->st_mtimespec = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. */ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = ptsdev_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_uidinfo, -1, 0); uifree(psc->pts_uidinfo); + seldrain(&psc->pts_inpoll); + seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; static int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct uidinfo *uid = td->td_ucred->cr_ruidinfo; /* Resource limiting. */ PROC_LOCK(p); ok = chgptscnt(uid, 1, lim_cur(p, RLIMIT_NPTS)); PROC_UNLOCK(p); if (!ok) return (EAGAIN); /* Try to allocate a new pts unit number. */ unit = alloc_unr(pts_pool); if (unit < 0) { chgptscnt(uid, -1, 0); return (EAGAIN); } if (unit > pts_maxdev) { free_unr(pts_pool, unit); chgptscnt(uid, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "pts inwait"); cv_init(&psc->pts_outwait, "pts outwait"); psc->pts_unit = unit; psc->pts_uidinfo = uid; uihold(uid); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct uidinfo *uid = td->td_ucred->cr_ruidinfo; /* Resource limiting. */ PROC_LOCK(p); ok = chgptscnt(uid, 1, lim_cur(p, RLIMIT_NPTS)); PROC_UNLOCK(p); if (!ok) return (EAGAIN); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "pts inwait"); cv_init(&psc->pts_outwait, "pts outwait"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_uidinfo = uid; uihold(uid); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY)) return (EINVAL); error = falloc(td, &fp, &fd); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td->td_proc->p_fd, fp, fd, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } #if defined(PTS_COMPAT) || defined(PTS_LINUX) static int ptmx_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *fp) { return (pts_alloc(fflags & (FREAD|FWRITE), td, fp)); } static struct cdevsw ptmx_cdevsw = { .d_version = D_VERSION, .d_fdopen = ptmx_fdopen, .d_name = "ptmx", }; #endif /* PTS_COMPAT || PTS_LINUX */ static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); #if defined(PTS_COMPAT) || defined(PTS_LINUX) make_dev(&ptmx_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "ptmx"); #endif /* PTS_COMPAT || PTS_LINUX */ } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); Index: stable/8/sys/kern/uipc_mqueue.c =================================================================== --- stable/8/sys/kern/uipc_mqueue.c (revision 225584) +++ stable/8/sys/kern/uipc_mqueue.c (revision 225585) @@ -1,2757 +1,2759 @@ /*- * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; u_int32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { 1, NULL, filt_mqdetach, filt_mqread }; struct filterops mq_wfiltops = { 1, NULL, filt_mqdetach, filt_mqwrite }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vrecycle(vp, curthread); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len) { struct mqfs_node *pn; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { if (strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp, ap->a_td); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); + seldrain(&mq->mq_rsel); + seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct proc *p; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; PROC_LOCK(p); if (!KSI_ONQ(&nt->nt_ksi)) psignal_event(p, &nt->nt_sigev, &nt->nt_ksi); PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; fdp = td->td_proc->p_fd; cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); error = falloc(td, &fp, &fd); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(fdp, fp, fd, td); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[fd] == fp) fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error != 0) return (error); if (uap->oattr != NULL) error = copyout(&oattr, uap->oattr, sizeof(oattr)); return (error); } int kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev; struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp; struct mqueue_notifier *nt, *newnt = NULL; int error; p = td->td_proc; fdp = td->td_proc->p_fd; if (uap->sigev) { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error) return (error); if (ev.sigev_notify != SIGEV_SIGNAL && ev.sigev_notify != SIGEV_THREAD_ID && ev.sigev_notify != SIGEV_NONE) return (EINVAL); if ((ev.sigev_notify == SIGEV_SIGNAL || ev.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(ev.sigev_signo)) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); if (fget_locked(fdp, uap->mqd) != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (uap->sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, uap->mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = uap->mqd; notifier_insert(p, nt); } nt->nt_sigev = ev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, uap->mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int mqf_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); st->st_atimespec = pn->mn_atime; st->st_mtimespec = pn->mn_mtime; st->st_ctimespec = pn->mn_ctime; st->st_birthtimespec = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; return (0); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static struct fileops mqueueops = { .fo_read = mqf_read, .fo_write = mqf_write, .fo_truncate = mqf_truncate, .fo_ioctl = mqf_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER(kmq_setattr), SYSCALL_INIT_HELPER(kmq_timedsend), SYSCALL_INIT_HELPER(kmq_timedreceive), SYSCALL_INIT_HELPER(kmq_notify), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error != 0) return (error); if (uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr), SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend), SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive), SYSCALL32_INIT_HELPER(kmq_notify), SYSCALL32_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: stable/8/sys/kern/uipc_socket.c =================================================================== --- stable/8/sys/kern/uipc_socket.c (revision 225584) +++ stable/8/sys/kern/uipc_socket.c (revision 225585) @@ -1,3639 +1,3641 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_zero.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_solisten(struct knote *kn, long hint); static struct filterops solisten_filtops = { 1, NULL, filt_sordetach, filt_solisten }; static struct filterops soread_filtops = { 1, NULL, filt_sordetach, filt_soread }; static struct filterops sowrite_filtops = { 1, NULL, filt_sowdetach, filt_sowrite }; uma_zone_t socket_zone; so_gen_t so_gencnt; /* generation count for sockets */ int maxsockets; MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); static int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS); /* XXX: we dont have SYSCTL_USHORT */ SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection " "queue size"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); #ifdef ZERO_COPY_SOCKETS /* These aren't static because they're used in other files. */ int so_zero_copy_send = 1; int so_zero_copy_receive = 1; SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, "Zero copy controls"); SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, &so_zero_copy_receive, 0, "Enable zero copy receive"); SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, &so_zero_copy_send, 0, "Enable zero copy send"); #endif /* ZERO_COPY_SOCKETS */ /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets) { maxsockets = newmaxsockets; if (maxsockets > ((maxfiles / 4) * 3)) { maxfiles = (maxsockets * 5) / 4; maxfilesperproc = (maxfiles * 9) / 10; } EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets avaliable"); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_aiojobq); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); vnet->vnet_sockcnt++; so->so_vnet = vnet; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); #ifdef INET /* remove acccept filter if one is present. */ if (so->so_accf != NULL) do_setopt_accept_filter(so, NULL); #endif #ifdef MAC mac_socket_destroy(so); #endif crfree(so->so_cred); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); so->so_count = 1; /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); so->so_count = 0; sodealloc(so); return (error); } *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, propoerly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; int over; ACCEPT_LOCK(); over = (head->so_qlen > 3 * head->so_qlimit / 2); ACCEPT_UNLOCK(); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) #else if (over) #endif return (NULL); VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) return (NULL); if ((head->so_options & SO_ACCEPTFILTER) != 0) connstatus = 0; so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; so->so_state |= connstatus; ACCEPT_LOCK(); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_qstate |= SQ_COMP; head->so_qlen++; } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); TAILQ_REMOVE(&head->so_incomp, sp, so_list); head->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; head->so_incqlen++; } ACCEPT_UNLOCK(); if (connstatus) { sorwakeup(head); wakeup_one(&head->so_timeo); } return (so); } int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return error; } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return error; } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { SOCK_LOCK_ASSERT(so); if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; so->so_options |= SO_ACCEPTCONN; } /* * Attempt to free a socket. This should really be sotryfree(). * * sofree() will succeed if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. * * Otherwise, it will quietly abort so that a future call to sofree(), when * conditions are right, can succeed. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; struct socket *head; ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { SOCK_UNLOCK(so); ACCEPT_UNLOCK(); return; } head = so->so_head; if (head != NULL) { KASSERT((so->so_qstate & SQ_COMP) != 0 || (so->so_qstate & SQ_INCOMP) != 0, ("sofree: so_head != NULL, but neither SQ_COMP nor " "SQ_INCOMP")); KASSERT((so->so_qstate & SQ_COMP) == 0 || (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; so->so_head = NULL; } KASSERT((so->so_qstate & SQ_COMP) == 0 && (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); if (so->so_options & SO_ACCEPTCONN) { KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated")); KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated")); } SOCK_UNLOCK(so); ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbrelease_internal() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); + seldrain(&so->so_snd.sb_sel); + seldrain(&so->so_rcv.sb_sel); knlist_destroy(&so->so_rcv.sb_sel.si_note); knlist_destroy(&so->so_snd.sb_sel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); if (so->so_options & SO_ACCEPTCONN) { struct socket *sp; ACCEPT_LOCK(); while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { TAILQ_REMOVE(&so->so_incomp, sp, so_list); so->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; sp->so_qstate &= ~SQ_COMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } ACCEPT_UNLOCK(); } ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #ifdef ZERO_COPY_SOCKETS struct so_zerocopy_stats{ int size_ok; int align_ok; int found_ifp; }; struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; #include #include #include #include #include #include /* * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise * sosend_dgram() and sosend_generic() use m_uiotombuf(). * * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or * all of the data referenced by the uio. If desired, it uses zero-copy. * *space will be updated to reflect data copied in. * * NB: If atomic I/O is requested, the caller must already have checked that * space can hold resid bytes. * * NB: In the event of an error, the caller may need to free the partial * chain pointed to by *mpp. The contents of both *uio and *space may be * modified even in the case of an error. */ static int sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, int flags) { struct mbuf *m, **mp, *top; long len, resid; int error; #ifdef ZERO_COPY_SOCKETS int cow_send; #endif *retmp = top = NULL; mp = ⊤ len = 0; resid = uio->uio_resid; error = 0; do { #ifdef ZERO_COPY_SOCKETS cow_send = 0; #endif /* ZERO_COPY_SOCKETS */ if (resid >= MINCLSIZE) { #ifdef ZERO_COPY_SOCKETS if (top == NULL) { m = m_gethdr(M_WAITOK, MT_DATA); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = NULL; } else m = m_get(M_WAITOK, MT_DATA); if (so_zero_copy_send && resid>=PAGE_SIZE && *space>=PAGE_SIZE && uio->uio_iov->iov_len>=PAGE_SIZE) { so_zerocp_stats.size_ok++; so_zerocp_stats.align_ok++; cow_send = socow_setup(m, uio); len = cow_send; } if (!cow_send) { m_clget(m, M_WAITOK); len = min(min(MCLBYTES, resid), *space); } #else /* ZERO_COPY_SOCKETS */ if (top == NULL) { m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = NULL; } else m = m_getcl(M_WAIT, MT_DATA, 0); len = min(min(MCLBYTES, resid), *space); #endif /* ZERO_COPY_SOCKETS */ } else { if (top == NULL) { m = m_gethdr(M_WAIT, MT_DATA); m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = NULL; len = min(min(MHLEN, resid), *space); /* * For datagram protocols, leave room * for protocol headers in first mbuf. */ if (atomic && m && len < MHLEN) MH_ALIGN(m, len); } else { m = m_get(M_WAIT, MT_DATA); len = min(min(MLEN, resid), *space); } } if (m == NULL) { error = ENOBUFS; goto out; } *space -= len; #ifdef ZERO_COPY_SOCKETS if (cow_send) error = 0; else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, void *), (int)len, uio); resid = uio->uio_resid; m->m_len = len; *mp = m; top->m_pkthdr.len += len; if (error) goto out; mp = &m->m_next; if (resid <= 0) { if (flags & MSG_EOR) top->m_flags |= M_EOR; break; } } while (*space > 0 && atomic); out: *retmp = top; return (error); } #endif /*ZERO_COPY_SOCKETS*/ #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space, resid; int clen = 0, error, dontroute; #ifdef ZERO_COPY_SOCKETS int atomic = sosendallatonce(so) || top; #endif KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sodgram_send: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { #ifdef ZERO_COPY_SOCKETS error = sosend_copyin(uio, &top, atomic, &space, flags); if (error) goto out; #else /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; #endif resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have recieved a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space, resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { #ifdef ZERO_COPY_SOCKETS error = sosend_copyin(uio, &top, atomic, &space, flags); if (error != 0) goto release; #else /* * Copy the data from userland into a mbuf * chain. If no data is to be copied in, * a single empty mbuf is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; #endif resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have recieved * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { int disposable; if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_DISPOSABLE)) disposable = 1; else disposable = 0; error = uiomoveco(mtod(m, void *), min(uio->uio_resid, m->m_len), uio, disposable); } else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, len, error, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; int orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_WAITALL is set, and it is possible to do the entire * receive operation at once if we block (resid <= hiwat). * 3. MSG_DONTWAIT is not set * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning a * short count if a timeout or signal occurs after we start. */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !so->so_rcv.sb_cc, ("receive: m == %p so->so_rcv.sb_cc == %u", m, so->so_rcv.sb_cc)); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m == NULL) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else goto dontblock; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copy(m, 0, m->m_len); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); #ifdef ZERO_COPY_SOCKETS if (so_zero_copy_receive) { int disposable; if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_DISPOSABLE)) disposable = 1; else disposable = 0; error = uiomoveco(mtod(m, char *) + moff, (int)len, uio, disposable); } else #endif /* ZERO_COPY_SOCKETS */ error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (m && pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { int copy_flag; if (flags & MSG_DONTWAIT) copy_flag = M_DONTWAIT; else copy_flag = M_WAIT; if (copy_flag == M_WAIT) SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, copy_flag); if (copy_flag == M_WAIT) SOCKBUF_LOCK(&so->so_rcv); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. Adjust * uio_resid back (it was * adjusted down by len * bytes, which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { /* When disconnecting there may be still some data left. */ if (sb->sb_cc > 0) goto deliver; if (!(so->so_state & SS_ISDISCONNECTED)) error = ENOTCONN; goto out; } /* Socket buffer is empty and we shall not block. */ if (sb->sb_cc == 0 && ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sb->sb_cc > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sb->sb_cc > 0) goto deliver; else goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sb->sb_cc >= sb->sb_lowat || sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sb->sb_cc); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { for (*mp0 = m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } sb->sb_mb = m; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); n->m_next = NULL; } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= m->m_len; if (*mp0 != NULL) n->m_next = m; else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, len, error; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(so->so_rcv.sb_cc == 0, ("soreceive_dgram: sb_mb NULL but sb_cc %u", so->so_rcv.sb_cc)); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) flags |= MSG_TRUNC; m_freem(m); if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) { (*pr->pr_usrreqs->pru_flush)(so, how); } if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); CURVNET_RESTORE(); return (error); } CURVNET_RESTORE(); return (0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct sockbuf asb; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&asb, offsetof(struct sockbuf, sb_startzero)); bcopy(&sb->sb_startzero, &asb.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the socket buffer. Don't call * any unsafe routines (that rely on locks being initialized) on asb. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease_internal(&asb, so); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; u_long val; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { error = (*so->so_proto->pr_ctloutput)(so, sopt); CURVNET_RESTORE(); return (error); } error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: error = do_setopt_accept_filter(so, sopt); if (error) goto bad; break; #endif case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (optval < 0 || optval > rt_numfibs) { error = EINVAL; goto bad; } if (so->so_proto != NULL && ((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) { so->so_fibnum = optval; /* Note: ignore error */ if (so->so_proto->pr_ctloutput) (*so->so_proto->pr_ctloutput)(so, sopt); } else { so->so_fibnum = 0; } break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: if (sbreserve(sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv, (u_long)optval, so, curthread) == 0) { error = ENOBUFS; goto bad; } (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; break; /* * Make sure the low-water is never greater than the * high-water. */ case SO_SNDLOWAT: SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_lowat = (optval > so->so_snd.sb_hiwat) ? so->so_snd.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_snd); break; case SO_RCVLOWAT: SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_lowat = (optval > so->so_rcv.sb_hiwat) ? so->so_rcv.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_rcv); break; } break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; /* assert(hz > 0); */ if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } /* assert(tick > 0); */ /* assert(ULONG_MAX - INT_MAX >= 1000000); */ val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; if (val > INT_MAX) { error = EDOM; goto bad; } if (val == 0 && tv.tv_usec != 0) val = 1; switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; default: error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { (void) ((*so->so_proto->pr_ctloutput) (so, sopt)); } } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { #ifdef INET case SO_ACCEPTFILTER: error = do_getopt_accept_filter(so, sopt); break; #endif case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_TYPE: optval = so->so_type; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: optval = (sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); tv.tv_sec = optval / hz; tv.tv_usec = (optval % hz) * tick; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = so->so_qlimit; goto integer; case SO_LISTENQLEN: optval = so->so_qlen; goto integer; case SO_LISTENINCQLEN: optval = so->so_incqlen; goto integer; default: error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAIT : M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rcv.sb_sel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rcv.sb_sel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_snd.sb_sel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else kn->kn_fop = &soread_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; sb = &so->so_snd; break; default: return (EINVAL); } SOCKBUF_LOCK(sb); knlist_add(&sb->sb_sel.si_note, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_rcv); knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_rcv); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_snd); knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_snd); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } /*ARGSUSED*/ static int filt_solisten(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; kn->kn_data = so->so_qlen; return (! TAILQ_EMPTY(&so->so_comp)); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1 || val > USHRT_MAX) return (EINVAL); somaxconn = val; return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { struct socket *head; int ret; restart: ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; head = so->so_head; if (head != NULL && (so->so_qstate & SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { SOCK_UNLOCK(so); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); head->so_qlen++; so->so_qstate |= SQ_COMP; ACCEPT_UNLOCK(); sorwakeup(head); wakeup_one(&head->so_timeo); } else { ACCEPT_UNLOCK(); soupcall_set(so, SO_RCV, head->so_accf->so_accept_filter->accf_callback, head->so_accf->so_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->so_accf->so_accept_filter->accf_callback(so, head->so_accf->so_accept_filter_arg, M_DONTWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); if (ret == SU_ISCONNECTED) goto restart; } return; } SOCK_UNLOCK(so); ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { /* * Note: This code assumes that SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) are the same. */ SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { /* * Note: This code assumes that SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) are the same. */ SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); sowwakeup_locked(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg) { struct sockbuf *sb; switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); #if 0 /* XXX: accf_http actually wants to do this on purpose. */ KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); #endif sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_qlen = so->so_qlen; xso->so_incqlen = so->so_incqlen; xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; } /* * Socket accessor functions to provide external consumers with * a safe interface to socket state * */ void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg) { TAILQ_FOREACH(so, &so->so_comp, so_list) func(so, arg); } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } Index: stable/8/sys/kern/vfs_subr.c =================================================================== --- stable/8/sys/kern/vfs_subr.c (revision 225584) +++ stable/8/sys/kern/vfs_subr.c (revision 225585) @@ -1,4403 +1,4404 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SW_WATCHDOG #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define WI_MPSAFEQ 0 #define WI_GIANTQ 1 static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of vnodes in the free list"); static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending[2]; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif static void vntblinit(void *dummy __unused) { int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to physical pages * is one to four until desiredvnodes exceeds 98,304. Thereafter, the * marginal ratio of desiredvnodes to physical pages is one to * sixteen. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * may not exceed one seventh of the kernel's heap size. */ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Initialize the filesystem syncer. */ syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If mount point is currenly being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. */ struct mount * vfs_busyfs(fsid_t *fsid) { struct mount *mp; int error; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { mtx_unlock(&mountlist_mtx); return (NULL); } return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside the jail of the calling * thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; int vfslocked; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vtryrecycle(vp); VFS_UNLOCK_GIANT(vfslocked); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done, vfslocked; struct proc *p = vnlruproc; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); done += vlrureclaim(mp); VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) destroy_vpollinfo(vp->v_pollinfo); #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp = NULL; struct bufobj *bo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); goto alloc; } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } alloc: numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF); bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; /* XXX non mp-safe fs may still call insmntque with vnode unlocked */ if (!VOP_ISLOCKED(vp)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { int locked; KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); #ifdef DEBUG_VFS_LOCKS if (!VFS_NEEDSGIANT(mp)) ASSERT_VOP_ELOCKED(vp, "insmntque: mp-safe fs and non-locked vp"); #endif MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) { locked = VOP_ISLOCKED(vp); if (!locked || (locked == LK_EXCLUSIVE && (vp->v_vflag & VV_FORCEINSMQ) == 0)) { MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, vp, cred, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) goto restart; BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); goto restart; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) goto restart; BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); goto restart; } } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); BO_LOCK(bo); bremfree(bp); BO_UNLOCK(bo); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. */ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_LOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT) bp->b_flags |= B_NEEDSGIANT; bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_flags &= ~B_NEEDSGIANT; bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int queue, slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ : WI_MPSAFEQ; LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = (*bo)->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *gnext, *next; struct synclist *gslp, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && LIST_EMPTY(gslp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } #ifdef SW_WATCHDOG if (first_printf == 0) wdog_kern_pat(WD_LASTVAL); #endif } if (!LIST_EMPTY(gslp)) { mtx_unlock(&sync_mtx); mtx_lock(&Giant); mtx_lock(&sync_mtx); while (!LIST_EMPTY(gslp)) { error = sync_vnode(gslp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(gnext, bo, bo_synclist); continue; } } mtx_unlock(&Giant); } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VFS_ASSERT_GIANT(vp->v_mount); VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); VFS_ASSERT_GIANT(vp->v_mount); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vputx: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { vprint("vputx: negative ref count", vp); panic("vputx: negative ref cnt"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) error = EBUSY; break; } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. */ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) vinvalbuf(vp, 0, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ lf_purgelocks(vp, &(vp->v_lockf)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; flags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_MPSAFE); MNT_KERN_FLAG(MNTK_NOKNOTE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_noasync = %u\n", mp->mnt_noasync); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } static void destroy_vpollinfo(struct vpollinfo *vi) { + seldrain(&vi->vpi_selinfo); knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); BO_UNLOCK(bo); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_data != 0); VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; VFS_ASSERT_GIANT(mp); ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } Index: stable/8/sys/net/bpf.c =================================================================== --- stable/8/sys/net/bpf.c (revision 225584) +++ stable/8/sys/net/bpf.c (revision 225585) @@ -1,2396 +1,2396 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_compat.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1)) /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B',109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B',110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B',121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B',123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B',130, struct bpf_program32) #endif /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ static LIST_HEAD(, bpf_if) bpf_iflist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, et. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len - hlen > ifp->if_mtu) return (EMSGSIZE); if ((unsigned)len > MJUM16BYTES) return (EIO); if (len <= MHLEN) MGETHDR(m, M_WAIT, MT_DATA); else if (len <= MCLBYTES) m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR); else m = m_getjcl(M_WAIT, MT_DATA, M_PKTHDR, #if (MJUMPAGESIZE > MCLBYTES) len <= MJUMPAGESIZE ? MJUMPAGESIZE : #endif (len <= MJUM9BYTES ? MJUM9BYTES : MJUM16BYTES)); m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; if (m->m_len < hlen) { error = EPERM; goto bad; } error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(m->m_data, sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { /* * Point d at bp, and add d to the interface's list of listeners. * Finally, point the driver's bpf cookie at the interface so * it will divert packets to bpf. */ BPFIF_LOCK(bp); d->bd_bif = bp; LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); bpf_bpfd_cnt++; BPFIF_UNLOCK(bp); EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { int error; struct bpf_if *bp; struct ifnet *ifp; bp = d->bd_bif; BPFIF_LOCK(bp); BPFD_LOCK(d); ifp = d->bd_bif->bif_ifp; /* * Remove d from the interface's descriptor list. */ LIST_REMOVE(d, bd_next); bpf_bpfd_cnt--; d->bd_bif = NULL; BPFD_UNLOCK(d); BPFIF_UNLOCK(bp); EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); mtx_lock(&bpf_mtx); if (d->bd_bif) bpf_detachd(d); mtx_unlock(&bpf_mtx); - selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ + seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpf_freed(d); free(d, M_BPF); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_mtx, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_mtx); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_mtx, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * XXXRW: More synchronization needed here: what if a second thread * issues a read on the same fd at the same time? Don't want this * getting invalidated. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); d->bd_pid = curthread->td_proc->p_pid; d->bd_wcount++; if (d->bd_bif == NULL) { d->bd_wdcount++; return (ENXIO); } ifp = d->bd_bif->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { d->bd_wdcount++; return (ENETDOWN); } if (uio->uio_resid == 0) { d->bd_wdcount++; return (0); } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); if (error) { d->bd_wdcount++; return (error); } d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_DONTWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC BPFD_LOCK(d); mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); BPFD_UNLOCK(d); #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); if (error) d->bd_wdcount++; if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } CURVNET_RESTORE(); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { mtx_assert(&d->bd_mtx, MA_OWNED); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; d->bd_wcount = 0; d->bd_wfcount = 0; d->bd_wdcount = 0; d->bd_zcopy = 0; } /* * FIONREAD Check for read packet available. * SIOCGIFADDR Get interface address - convenient hook to driver. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #ifdef COMPAT_FREEBSD32 case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #ifdef COMPAT_FREEBSD32 case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: d->bd_compat32 = 1; } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } case SIOCGIFADDR: { struct ifnet *ifp; if (d->bd_bif == NULL) error = EINVAL; else { ifp = d->bd_bif->bif_ifp; error = (*ifp->if_ioctl)(ifp, cmd, addr); } break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: *(u_int *)addr = d->bd_bufsize; break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } break; } #endif case BIOCGDLTLIST: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); break; /* * Set data link type. */ case BIOCSDLT: if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); break; /* * Get interface name. */ case BIOCGETIF: if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } break; /* * Set interface. */ case BIOCSETIF: error = bpf_setif(d, (struct ifreq *)addr); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: #ifdef COMPAT_FREEBSD32 case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #ifdef COMPAT_FREEBSD32 struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #ifdef COMPAT_FREEBSD32 case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #ifdef COMPAT_FREEBSD32 struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #ifdef COMPAT_FREEBSD32 if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: d->bd_immediate = *(u_int *)addr; break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: *(u_int *)addr = d->bd_hdrcmplt; break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; break; /* * Get packet direction flag */ case BIOCGDIRECTION: *(u_int *)addr = d->bd_direction; break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: d->bd_direction = direction; break; default: error = EINVAL; } } break; case BIOCFEEDBACK: d->bd_feedback = *(u_int *)addr; break; case BIOCLOCK: d->bd_locked = 1; break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ d->bd_async = *(int *)addr; break; case FIOSETOWN: error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: *(int *)addr = fgetown(&d->bd_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else d->bd_sig = sig; break; } case BIOCGRSIG: *(u_int *)addr = d->bd_sig; break; case BIOCGETBUFMODE: *(u_int *)addr = d->bd_bufmode; break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { struct bpf_insn *fcode, *old; u_int wfilter, flen, size; #ifdef BPF_JITTER bpf_jit_filter *ofunc; #endif #ifdef COMPAT_FREEBSD32 struct bpf_program32 *fp32; struct bpf_program fp_swab; if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) { fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; if (cmd == BIOCSETWF32) cmd = BIOCSETWF; } #endif if (cmd == BIOCSETWF) { old = d->bd_wfilter; wfilter = 1; #ifdef BPF_JITTER ofunc = NULL; #endif } else { wfilter = 0; old = d->bd_rfilter; #ifdef BPF_JITTER ofunc = d->bd_bfilter; #endif } if (fp->bf_insns == NULL) { if (fp->bf_len != 0) return (EINVAL); BPFD_LOCK(d); if (wfilter) d->bd_wfilter = NULL; else { d->bd_rfilter = NULL; #ifdef BPF_JITTER d->bd_bfilter = NULL; #endif if (cmd == BIOCSETF) reset_d(d); } BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } flen = fp->bf_len; if (flen > bpf_maxinsns) return (EINVAL); size = flen * sizeof(*fp->bf_insns); fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK); if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && bpf_validate(fcode, (int)flen)) { BPFD_LOCK(d); if (wfilter) d->bd_wfilter = fcode; else { d->bd_rfilter = fcode; #ifdef BPF_JITTER d->bd_bfilter = bpf_jitter(fcode, flen); #endif if (cmd == BIOCSETF) reset_d(d); } BPFD_UNLOCK(d); if (old != NULL) free((caddr_t)old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif return (0); } free((caddr_t)fcode, M_BPF); return (EINVAL); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * Behavior here depends on the buffering model. If we're using * kernel memory buffers, then we can allocate them here. If we're * using zero-copy, then the user process must have registered * buffers by the time we get here. If not, return an error. * * XXXRW: There are locking issues here with multi-threaded use: what * if two threads try to set the interface at once? */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: if (d->bd_sbuf == NULL) bpf_buffer_alloc(d); KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); break; case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) { if (d->bd_bif) /* * Detach if attached to something else. */ bpf_detachd(d); bpf_attachd(d, bp); } BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); d->bd_pid = td->td_proc->p_pid; if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); d->bd_pid = curthread->td_proc->p_pid; kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; if (d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; struct timeval tv; gottime = 0; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { BPFD_LOCK(d); ++d->bd_rcount; /* * NB: We dont call BPF_CHECK_DIRECTION() here since there is no * way for the caller to indiciate to us whether this packet * is inbound or outbound. In the bpf_mtap() routines, we use * the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; struct timeval tv; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } gottime = 0; pktlen = m_length(m, NULL); BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; BPFD_LOCK(d); ++d->bd_rcount; #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; struct timeval tv; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } gottime = 0; pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; BPFIF_LOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; BPFD_LOCK(d); ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { d->bd_fcount++; if (!gottime) { microtime(&tv); gottime = 1; } #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } #undef BPF_CHECK_DIRECTION /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *tv) { struct bpf_hdr hdr; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32; #endif int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); ++d->bd_dcount; return; } ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; #ifdef COMPAT_FREEBSD32 /* * If this is a 32-bit stream, then stick a 32-bit header at the * front and copy the data into the buffer. */ if (d->bd_compat32) { bzero(&hdr32, sizeof(hdr32)); hdr32.bh_tstamp.tv_sec = tv->tv_sec; hdr32.bh_tstamp.tv_usec = tv->tv_usec; hdr32.bh_datalen = pktlen; hdr32.bh_hdrlen = hdrlen; hdr.bh_caplen = hdr32.bh_caplen = totlen - hdrlen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32, sizeof(hdr32)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); hdr.bh_tstamp = *tv; hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = totlen - hdrlen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifdef COMPAT_FREEBSD32 copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ bpf_free(d); if (d->bd_rfilter != NULL) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER if (d->bd_bfilter != NULL) bpf_destroy_jit_filter(d->bd_bfilter); #endif } if (d->bd_wfilter != NULL) free((caddr_t)d->bd_wfilter, M_BPF); mtx_destroy(&d->bd_mtx); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); if (bp == NULL) panic("bpfattach"); LIST_INIT(&bp->bif_dlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF); KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); *driverp = bp; mtx_lock(&bpf_mtx); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); mtx_unlock(&bpf_mtx); /* * Compute the length of the bpf header. This is not necessarily * equal to SIZEOF_BPF_HDR because we want to insert spacing such * that the network layer header begins on a longword boundary (for * performance reasons and to alleviate alignment restrictions). */ bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; if (bootverbose) if_printf(ifp, "bpf attached\n"); } /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface, and leaving bd_bif NULL. Notify each * descriptor as it's detached so that any sleepers wake up and get * ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; /* Locate BPF interface information */ mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (ifp == bp->bif_ifp) break; } /* Interface wasn't attached */ if ((bp == NULL) || (bp->bif_ifp == NULL)) { mtx_unlock(&bpf_mtx); printf("bpfdetach: %s was not attached\n", ifp->if_xname); return; } LIST_REMOVE(bp, bif_next); mtx_unlock(&bpf_mtx); while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } mtx_destroy(&bp->bif_mtx); free(bp, M_BPF); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; ifp = d->bd_bif->bif_ifp; n = 0; error = 0; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { if (n >= bfl->bfl_len) { mtx_unlock(&bpf_mtx); return (ENOMEM); } error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); } n++; } mtx_unlock(&bpf_mtx); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } mtx_unlock(&bpf_mtx); if (bp != NULL) { opromisc = d->bd_promisc; bpf_detachd(d); bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "bpf_setdlt: ifpromisc failed (%d)\n", error); else d->bd_promisc = 1; } } return (bp == NULL ? EINVAL : 0); } static void bpf_drvinit(void *unused) { struct cdev *dev; mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_LOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { BPFD_LOCK(bd); bd->bd_rcount = 0; bd->bd_dcount = 0; bd->bd_fcount = 0; bd->bd_wcount = 0; bd->bd_wfcount = 0; bd->bd_zcopy = 0; BPFD_UNLOCK(bd); } BPFIF_UNLOCK(bp); } mtx_unlock(&bpf_mtx); } static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = bd->bd_rcount; d->bd_dcount = bd->bd_dcount; d->bd_fcount = bd->bd_fcount; d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = bd->bd_wcount; d->bd_wdcount = bd->bd_wdcount; d->bd_wfcount = bd->bd_wfcount; d->bd_zcopy = bd->bd_zcopy; d->bd_bufmode = bd->bd_bufmode; } static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { struct xbpf_d *xbdbuf, *xbd, zerostats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. */ if (req->newptr != NULL) { if (req->newlen != sizeof(zerostats)) return (EINVAL); bzero(&zerostats, sizeof(zerostats)); xbd = req->newptr; if (bcmp(xbd, &zerostats, sizeof(*xbd)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); mtx_lock(&bpf_mtx); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { mtx_unlock(&bpf_mtx); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_LOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } BPFIF_UNLOCK(bp); } mtx_unlock(&bpf_mtx); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ static struct bpf_if bp_null; void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = &bp_null; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ Index: stable/8/sys/net/if_tap.c =================================================================== --- stable/8/sys/net/if_tap.c (revision 225584) +++ stable/8/sys/net/if_tap.c (revision 225585) @@ -1,1084 +1,1085 @@ /*- * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. */ /* * $FreeBSD$ * $Id: if_tap.c,v 0.21 2000/07/23 21:46:02 max Exp $ */ #include "opt_compat.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CDEV_NAME "tap" #define TAPDEBUG if (tapdebug) printf #define TAP "tap" #define VMNET "vmnet" #define TAPMAXUNIT 0x7fff #define VMNET_DEV_MASK CLONE_FLAG0 /* module */ static int tapmodevent(module_t, int, void *); /* device */ static void tapclone(void *, struct ucred *, char *, int, struct cdev **); static void tapcreate(struct cdev *); /* network interface */ static void tapifstart(struct ifnet *); static int tapifioctl(struct ifnet *, u_long, caddr_t); static void tapifinit(void *); static int tap_clone_create(struct if_clone *, int, caddr_t); static void tap_clone_destroy(struct ifnet *); static int vmnet_clone_create(struct if_clone *, int, caddr_t); static void vmnet_clone_destroy(struct ifnet *); IFC_SIMPLE_DECLARE(tap, 0); IFC_SIMPLE_DECLARE(vmnet, 0); /* character device */ static d_open_t tapopen; static d_close_t tapclose; static d_read_t tapread; static d_write_t tapwrite; static d_ioctl_t tapioctl; static d_poll_t tappoll; static d_kqfilter_t tapkqfilter; /* kqueue(2) */ static int tapkqread(struct knote *, long); static int tapkqwrite(struct knote *, long); static void tapkqdetach(struct knote *); static struct filterops tap_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tapkqdetach, .f_event = tapkqread, }; static struct filterops tap_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tapkqdetach, .f_event = tapkqwrite, }; static struct cdevsw tap_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDMINOR, .d_open = tapopen, .d_close = tapclose, .d_read = tapread, .d_write = tapwrite, .d_ioctl = tapioctl, .d_poll = tappoll, .d_name = CDEV_NAME, .d_kqfilter = tapkqfilter, }; /* * All global variables in if_tap.c are locked with tapmtx, with the * exception of tapdebug, which is accessed unlocked; tapclones is * static at runtime. */ static struct mtx tapmtx; static int tapdebug = 0; /* debug flag */ static int tapuopen = 0; /* allow user open() */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static SLIST_HEAD(, tap_softc) taphead; /* first device */ static struct clonedevs *tapclones; MALLOC_DECLARE(M_TAP); MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface"); SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, ""); SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tapdclone, 0, "Enably legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); TUNABLE_INT("net.link.tap.devfs_cloning", &tapdclone); DEV_MODULE(if_tap, tapmodevent, NULL); static int tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct cdev *dev; int i; int extra; if (strcmp(ifc->ifc_name, VMNET) == 0) extra = VMNET_DEV_MASK; else extra = 0; /* find any existing device, or allocate new unit number */ i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, extra); if (i) { dev = make_dev(&tap_cdevsw, unit | extra, UID_ROOT, GID_WHEEL, 0600, "%s%d", ifc->ifc_name, unit); } tapcreate(dev); return (0); } /* vmnet devices are tap devices in disguise */ static int vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) { return tap_clone_create(ifc, unit, params); } static void tap_destroy(struct tap_softc *tp) { struct ifnet *ifp = tp->tap_ifp; /* Unlocked read. */ KASSERT(!(tp->tap_flags & TAP_OPEN), ("%s flags is out of sync", ifp->if_xname)); + seldrain(&tp->tap_rsel); knlist_destroy(&tp->tap_rsel.si_note); destroy_dev(tp->tap_dev); ether_ifdetach(ifp); if_free_type(ifp, IFT_ETHER); mtx_destroy(&tp->tap_mtx); free(tp, M_TAP); } static void tap_clone_destroy(struct ifnet *ifp) { struct tap_softc *tp = ifp->if_softc; mtx_lock(&tapmtx); SLIST_REMOVE(&taphead, tp, tap_softc, tap_next); mtx_unlock(&tapmtx); tap_destroy(tp); } /* vmnet devices are tap devices in disguise */ static void vmnet_clone_destroy(struct ifnet *ifp) { tap_clone_destroy(ifp); } /* * tapmodevent * * module event handler */ static int tapmodevent(module_t mod, int type, void *data) { static eventhandler_tag eh_tag = NULL; struct tap_softc *tp = NULL; struct ifnet *ifp = NULL; switch (type) { case MOD_LOAD: /* intitialize device */ mtx_init(&tapmtx, "tapmtx", NULL, MTX_DEF); SLIST_INIT(&taphead); clone_setup(&tapclones); eh_tag = EVENTHANDLER_REGISTER(dev_clone, tapclone, 0, 1000); if (eh_tag == NULL) { clone_cleanup(&tapclones); mtx_destroy(&tapmtx); return (ENOMEM); } if_clone_attach(&tap_cloner); if_clone_attach(&vmnet_cloner); return (0); case MOD_UNLOAD: /* * The EBUSY algorithm here can't quite atomically * guarantee that this is race-free since we have to * release the tap mtx to deregister the clone handler. */ mtx_lock(&tapmtx); SLIST_FOREACH(tp, &taphead, tap_next) { mtx_lock(&tp->tap_mtx); if (tp->tap_flags & TAP_OPEN) { mtx_unlock(&tp->tap_mtx); mtx_unlock(&tapmtx); return (EBUSY); } mtx_unlock(&tp->tap_mtx); } mtx_unlock(&tapmtx); EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); if_clone_detach(&tap_cloner); if_clone_detach(&vmnet_cloner); drain_dev_clone_events(); mtx_lock(&tapmtx); while ((tp = SLIST_FIRST(&taphead)) != NULL) { SLIST_REMOVE_HEAD(&taphead, tap_next); mtx_unlock(&tapmtx); ifp = tp->tap_ifp; TAPDEBUG("detaching %s\n", ifp->if_xname); tap_destroy(tp); mtx_lock(&tapmtx); } mtx_unlock(&tapmtx); clone_cleanup(&tapclones); mtx_destroy(&tapmtx); break; default: return (EOPNOTSUPP); } return (0); } /* tapmodevent */ /* * DEVFS handler * * We need to support two kind of devices - tap and vmnet */ static void tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; int i, unit, append_unit; int extra; if (*dev != NULL) return; if (!tapdclone || (!tapuopen && priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0)) return; unit = 0; append_unit = 0; extra = 0; /* We're interested in only tap/vmnet devices. */ if (strcmp(name, TAP) == 0) { unit = -1; } else if (strcmp(name, VMNET) == 0) { unit = -1; extra = VMNET_DEV_MASK; } else if (dev_stdclone(name, NULL, TAP, &unit) != 1) { if (dev_stdclone(name, NULL, VMNET, &unit) != 1) { return; } else { extra = VMNET_DEV_MASK; } } if (unit == -1) append_unit = 1; /* find any existing device, or allocate new unit number */ i = clone_create(&tapclones, &tap_cdevsw, &unit, dev, extra); if (i) { if (append_unit) { /* * We were passed 'tun' or 'tap', with no unit specified * so we'll need to append it now. */ namelen = snprintf(devname, sizeof(devname), "%s%d", name, unit); name = devname; } *dev = make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra, cred, UID_ROOT, GID_WHEEL, 0600, "%s", name); } if_clone_create(name, namelen, NULL); } /* tapclone */ /* * tapcreate * * to create interface */ static void tapcreate(struct cdev *dev) { struct ifnet *ifp = NULL; struct tap_softc *tp = NULL; unsigned short macaddr_hi; uint32_t macaddr_mid; int unit; char *name = NULL; u_char eaddr[6]; dev->si_flags &= ~SI_CHEAPCLONE; /* allocate driver storage and create device */ tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); mtx_lock(&tapmtx); SLIST_INSERT_HEAD(&taphead, tp, tap_next); mtx_unlock(&tapmtx); unit = dev2unit(dev); /* select device: tap or vmnet */ if (unit & VMNET_DEV_MASK) { name = VMNET; tp->tap_flags |= TAP_VMNET; } else name = TAP; unit &= TAPMAXUNIT; TAPDEBUG("tapcreate(%s%d). minor = %#x\n", name, unit, dev2unit(dev)); /* generate fake MAC address: 00 bd xx xx xx unit_no */ macaddr_hi = htons(0x00bd); macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (u_char)unit; /* fill the rest and attach interface */ ifp = tp->tap_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s%d: can not if_alloc()", name, unit); ifp->if_softc = tp; if_initname(ifp, name, unit); ifp->if_init = tapifinit; ifp->if_start = tapifstart; ifp->if_ioctl = tapifioctl; ifp->if_mtu = ETHERMTU; ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; dev->si_drv1 = tp; tp->tap_dev = dev; ether_ifattach(ifp, eaddr); mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_INITED; mtx_unlock(&tp->tap_mtx); knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); TAPDEBUG("interface %s is created. minor = %#x\n", ifp->if_xname, dev2unit(dev)); } /* tapcreate */ /* * tapopen * * to open tunnel. must be superuser */ static int tapopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct tap_softc *tp = NULL; struct ifnet *ifp = NULL; int error; if (tapuopen == 0) { error = priv_check(td, PRIV_NET_TAP); if (error) return (error); } if ((dev2unit(dev) & CLONE_UNITMASK) > TAPMAXUNIT) return (ENXIO); tp = dev->si_drv1; mtx_lock(&tp->tap_mtx); if (tp->tap_flags & TAP_OPEN) { mtx_unlock(&tp->tap_mtx); return (EBUSY); } bcopy(IF_LLADDR(tp->tap_ifp), tp->ether_addr, sizeof(tp->ether_addr)); tp->tap_pid = td->td_proc->p_pid; tp->tap_flags |= TAP_OPEN; ifp = tp->tap_ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; if_link_state_change(ifp, LINK_STATE_UP); mtx_unlock(&tp->tap_mtx); TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); return (0); } /* tapopen */ /* * tapclose * * close the device - mark i/f down & delete routing info */ static int tapclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct ifaddr *ifa; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; /* junk all pending output */ mtx_lock(&tp->tap_mtx); IF_DRAIN(&ifp->if_snd); /* * do not bring the interface down, and do not anything with * interface, if we are in VMnet mode. just close the device. */ if (((tp->tap_flags & TAP_VMNET) == 0) && (ifp->if_flags & IFF_UP)) { mtx_unlock(&tp->tap_mtx); if_down(ifp); mtx_lock(&tp->tap_mtx); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; mtx_unlock(&tp->tap_mtx); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { rtinit(ifa, (int)RTM_DELETE, 0); } if_purgeaddrs(ifp); mtx_lock(&tp->tap_mtx); } } if_link_state_change(ifp, LINK_STATE_DOWN); funsetown(&tp->tap_sigio); selwakeuppri(&tp->tap_rsel, PZERO+1); KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); tp->tap_flags &= ~TAP_OPEN; tp->tap_pid = 0; mtx_unlock(&tp->tap_mtx); TAPDEBUG("%s is closed. minor = %#x\n", ifp->if_xname, dev2unit(dev)); return (0); } /* tapclose */ /* * tapifinit * * network interface initialization function */ static void tapifinit(void *xtp) { struct tap_softc *tp = (struct tap_softc *)xtp; struct ifnet *ifp = tp->tap_ifp; TAPDEBUG("initializing %s\n", ifp->if_xname); mtx_lock(&tp->tap_mtx); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; mtx_unlock(&tp->tap_mtx); /* attempt to start output */ tapifstart(ifp); } /* tapifinit */ /* * tapifioctl * * Process an ioctl request on network interface */ static int tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct tap_softc *tp = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct ifstat *ifs = NULL; int dummy; switch (cmd) { case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGIFSTATUS: ifs = (struct ifstat *)data; dummy = strlen(ifs->ascii); mtx_lock(&tp->tap_mtx); if (tp->tap_pid != 0 && dummy < sizeof(ifs->ascii)) snprintf(ifs->ascii + dummy, sizeof(ifs->ascii) - dummy, "\tOpened by PID %d\n", tp->tap_pid); mtx_unlock(&tp->tap_mtx); break; default: return (ether_ioctl(ifp, cmd, data)); /* NOT REACHED */ } return (0); } /* tapifioctl */ /* * tapifstart * * queue packets from higher level ready to put out */ static void tapifstart(struct ifnet *ifp) { struct tap_softc *tp = ifp->if_softc; TAPDEBUG("%s starting\n", ifp->if_xname); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ mtx_lock(&tp->tap_mtx); if (((tp->tap_flags & TAP_VMNET) == 0) && ((tp->tap_flags & TAP_READY) != TAP_READY)) { struct mbuf *m; /* Unlocked read. */ TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, tp->tap_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); ifp->if_oerrors++; } else break; } mtx_unlock(&tp->tap_mtx); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tap_flags & TAP_RWAIT) { tp->tap_flags &= ~TAP_RWAIT; wakeup(tp); } if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { mtx_unlock(&tp->tap_mtx); pgsigio(&tp->tap_sigio, SIGIO, 0); mtx_lock(&tp->tap_mtx); } selwakeuppri(&tp->tap_rsel, PZERO+1); KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); ifp->if_opackets ++; /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; mtx_unlock(&tp->tap_mtx); } /* tapifstart */ /* * tapioctl * * the cdevsw interface is now pretty minimal */ static int tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct tapinfo *tapp = NULL; int f; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; #endif switch (cmd) { case TAPSIFINFO: tapp = (struct tapinfo *)data; mtx_lock(&tp->tap_mtx); ifp->if_mtu = tapp->mtu; ifp->if_type = tapp->type; ifp->if_baudrate = tapp->baudrate; mtx_unlock(&tp->tap_mtx); break; case TAPGIFINFO: tapp = (struct tapinfo *)data; mtx_lock(&tp->tap_mtx); tapp->mtu = ifp->if_mtu; tapp->type = ifp->if_type; tapp->baudrate = ifp->if_baudrate; mtx_unlock(&tp->tap_mtx); break; case TAPSDEBUG: tapdebug = *(int *)data; break; case TAPGDEBUG: *(int *)data = tapdebug; break; case TAPGIFNAME: { struct ifreq *ifr = (struct ifreq *) data; strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); } break; case FIONBIO: break; case FIOASYNC: mtx_lock(&tp->tap_mtx); if (*(int *)data) tp->tap_flags |= TAP_ASYNC; else tp->tap_flags &= ~TAP_ASYNC; mtx_unlock(&tp->tap_mtx); break; case FIONREAD: if (!IFQ_IS_EMPTY(&ifp->if_snd)) { struct mbuf *mb; IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&ifp->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tap_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tap_sigio); return (0); /* this is deprecated, FIOSETOWN should be used instead */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tap_sigio)); /* this is deprecated, FIOGETOWN should be used instead */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tap_sigio); return (0); /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ f = *(int *)data; f &= 0x0fff; f &= ~IFF_CANTCHANGE; f |= IFF_UP; mtx_lock(&tp->tap_mtx); ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); mtx_unlock(&tp->tap_mtx); break; case OSIOCGIFADDR: /* get MAC address of the remote side */ case SIOCGIFADDR: mtx_lock(&tp->tap_mtx); bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); break; case SIOCSIFADDR: /* set MAC address of the remote side */ mtx_lock(&tp->tap_mtx); bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); break; default: return (ENOTTY); } return (0); } /* tapioctl */ /* * tapread * * the cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read */ static int tapread(struct cdev *dev, struct uio *uio, int flag) { struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct mbuf *m = NULL; int error = 0, len; TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); mtx_lock(&tp->tap_mtx); if ((tp->tap_flags & TAP_READY) != TAP_READY) { mtx_unlock(&tp->tap_mtx); /* Unlocked read. */ TAPDEBUG("%s not ready. minor = %#x, tap_flags = 0x%x\n", ifp->if_xname, dev2unit(dev), tp->tap_flags); return (EHOSTDOWN); } tp->tap_flags &= ~TAP_RWAIT; /* sleep until we get a packet */ do { IF_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { mtx_unlock(&tp->tap_mtx); return (EWOULDBLOCK); } tp->tap_flags |= TAP_RWAIT; error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), "taprd", 0); if (error) { mtx_unlock(&tp->tap_mtx); return (error); } } } while (m == NULL); mtx_unlock(&tp->tap_mtx); /* feed packet to bpf */ BPF_MTAP(ifp, m); /* xfer packet to user space */ while ((m != NULL) && (uio->uio_resid > 0) && (error == 0)) { len = min(uio->uio_resid, m->m_len); if (len == 0) break; error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m != NULL) { TAPDEBUG("%s dropping mbuf, minor = %#x\n", ifp->if_xname, dev2unit(dev)); m_freem(m); } return (error); } /* tapread */ /* * tapwrite * * the cdevsw write interface - an atomic write is a packet - or else! */ static int tapwrite(struct cdev *dev, struct uio *uio, int flag) { struct ether_header *eh; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct mbuf *m; TAPDEBUG("%s writting, minor = %#x\n", ifp->if_xname, dev2unit(dev)); if (uio->uio_resid == 0) return (0); if ((uio->uio_resid < 0) || (uio->uio_resid > TAPMRU)) { TAPDEBUG("%s invalid packet len = %zd, minor = %#x\n", ifp->if_xname, uio->uio_resid, dev2unit(dev)); return (EIO); } if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, ETHER_ALIGN, M_PKTHDR)) == NULL) { ifp->if_ierrors ++; return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; /* * Only pass a unicast frame to ether_input(), if it would actually * have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } /* Pass packet up to parent. */ (*ifp->if_input)(ifp, m); ifp->if_ipackets ++; /* ibytes are counted in parent */ return (0); } /* tapwrite */ /* * tappoll * * the poll interface, this is only useful on reads * really. the write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it */ static int tappoll(struct cdev *dev, int events, struct thread *td) { struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; int revents = 0; TAPDEBUG("%s polling, minor = %#x\n", ifp->if_xname, dev2unit(dev)); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TAPDEBUG("%s have data in queue. len = %d, " \ "minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); revents |= (events & (POLLIN | POLLRDNORM)); } else { TAPDEBUG("%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); selrecord(td, &tp->tap_rsel); } IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= (events & (POLLOUT | POLLWRNORM)); return (revents); } /* tappoll */ /* * tap_kqfilter * * support for kevent() system call */ static int tapkqfilter(struct cdev *dev, struct knote *kn) { struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; switch (kn->kn_filter) { case EVFILT_READ: TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tap_read_filterops; break; case EVFILT_WRITE: TAPDEBUG("%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tap_write_filterops; break; default: TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return (EINVAL); /* NOT REACHED */ } kn->kn_hook = tp; knlist_add(&tp->tap_rsel.si_note, kn, 0); return (0); } /* tapkqfilter */ /* * tap_kqread * * Return true if there is data in the interface queue */ static int tapkqread(struct knote *kn, long hint) { int ret; struct tap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tap_dev; struct ifnet *ifp = tp->tap_ifp; if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TAPDEBUG("%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* tapkqread */ /* * tap_kqwrite * * Always can write. Return the MTU in kn->data */ static int tapkqwrite(struct knote *kn, long hint) { struct tap_softc *tp = kn->kn_hook; struct ifnet *ifp = tp->tap_ifp; kn->kn_data = ifp->if_mtu; return (1); } /* tapkqwrite */ static void tapkqdetach(struct knote *kn) { struct tap_softc *tp = kn->kn_hook; knlist_remove(&tp->tap_rsel.si_note, kn, 0); } /* tapkqdetach */ Index: stable/8/sys/net/if_tun.c =================================================================== --- stable/8/sys/net/if_tun.c (revision 225584) +++ stable/8/sys/net/if_tun.c (revision 225585) @@ -1,1050 +1,1051 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #include #include /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tun_softc { TAILQ_ENTRY(tun_softc) tun_list; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_READY (TUN_OPEN | TUN_INITED) /* * XXXRW: tun_pid is used to exclusively lock /dev/tun. Is this * actually needed? Can we just return EBUSY if already open? * Problem is that this involved inherent races when a tun device * is handed off from one process to another, as opposed to just * being slightly stale informationally. */ pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* information for async I/O */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* protect mutable softc fields */ struct cv tun_cv; /* protect against ref'd dev destroy */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUNNAME "tun" /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and tunclones, * which is static after setup. */ static struct mtx tunmtx; static MALLOC_DEFINE(M_TUN, TUNNAME, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static struct clonedevs *tunclones; static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface."); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tundclone, 0, "Enable legacy devfs interface creation."); TUNABLE_INT("net.link.tun.devfs_cloning", &tundclone); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(const char *name, struct cdev *dev); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static int tunmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static int tun_clone_create(struct if_clone *, int, caddr_t); static void tun_clone_destroy(struct ifnet *); IFC_SIMPLE_DECLARE(tun, 0); static d_open_t tunopen; static d_close_t tunclose; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct cdevsw tun_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = TUNNAME, }; static int tun_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct cdev *dev; int i; /* find any existing device, or allocate new unit number */ i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0); if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&tun_cdevsw, unit, UID_UUCP, GID_DIALER, 0600, "%s%d", ifc->ifc_name, unit); } tuncreate(ifc->ifc_name, dev); return (0); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; int u, i, append_unit; if (*dev != NULL) return; /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0) return; if (strcmp(name, TUNNAME) == 0) { u = -1; } else if (dev_stdclone(name, NULL, TUNNAME, &u) != 1) return; /* Don't recognise the name */ if (u != -1 && u > IF_MAXUNIT) return; /* Unit number too high */ if (u == -1) append_unit = 1; else append_unit = 0; CURVNET_SET(CRED_TO_VNET(cred)); /* find any existing device, or allocate new unit number */ i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } /* No preexisting struct cdev *, create one */ *dev = make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred, UID_UUCP, GID_DIALER, 0600, "%s", name); } if_clone_create(name, namelen, NULL); CURVNET_RESTORE(); } static void tun_destroy(struct tun_softc *tp) { struct cdev *dev; /* Unlocked read. */ mtx_lock(&tp->tun_mtx); if ((tp->tun_flags & TUN_OPEN) != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else mtx_unlock(&tp->tun_mtx); CURVNET_SET(TUN2IFP(tp)->if_vnet); dev = tp->tun_dev; bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); if_free(TUN2IFP(tp)); destroy_dev(dev); + seldrain(&tp->tun_rsel); knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static void tun_clone_destroy(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); } static int tunmodevent(module_t mod, int type, void *data) { static eventhandler_tag tag; struct tun_softc *tp; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); clone_setup(&tunclones); tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (tag == NULL) return (ENOMEM); if_clone_attach(&tun_cloner); break; case MOD_UNLOAD: if_clone_detach(&tun_cloner); EVENTHANDLER_DEREGISTER(dev_clone, tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); clone_cleanup(&tunclones); mtx_destroy(&tunmtx); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tun_mod = { "if_tun", tunmodevent, 0 }; DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static void tunstart(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } mtx_lock(&tp->tun_mtx); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { mtx_unlock(&tp->tun_mtx); pgsigio(&tp->tun_sigio, SIGIO, 0); } else mtx_unlock(&tp->tun_mtx); } /* XXX: should return an error code so it can fail. */ static void tuncreate(const char *name, struct cdev *dev) { struct tun_softc *sc; struct ifnet *ifp; dev->si_flags &= ~SI_CHEAPCLONE; sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); sc->tun_flags = TUN_INITED; sc->tun_dev = dev; mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); mtx_unlock(&tunmtx); ifp = sc->tun_ifp = if_alloc(IFT_PPP); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", name, dev2unit(dev)); if_initname(ifp, name, dev2unit(dev)); ifp->if_mtu = TUNMTU; ifp->if_ioctl = tunifioctl; ifp->if_output = tunoutput; ifp->if_start = tunstart; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_softc = sc; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); dev->si_drv1 = sc; TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tun_softc *tp; /* * XXXRW: Non-atomic test and set of dev->si_drv1 requires * synchronization. */ tp = dev->si_drv1; if (!tp) { tuncreate(TUNNAME, dev); tp = dev->si_drv1; } /* * XXXRW: This use of tun_pid is subject to error due to the * fact that a reference to the tunnel can live beyond the * death of the process that created it. Can we replace this * with a simple busy flag? */ mtx_lock(&tp->tun_mtx); if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) { mtx_unlock(&tp->tun_mtx); return (EBUSY); } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; ifp = TUN2IFP(tp); if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); mtx_unlock(&tp->tun_mtx); return (0); } /* * tunclose - close the device - mark i/f down & delete * routing info */ static int tunclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct tun_softc *tp; struct ifnet *ifp; tp = dev->si_drv1; ifp = TUN2IFP(tp); mtx_lock(&tp->tun_mtx); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); IFQ_PURGE(&ifp->if_snd); if (ifp->if_flags & IFF_UP) { mtx_unlock(&tp->tun_mtx); if_down(ifp); mtx_lock(&tp->tun_mtx); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; mtx_unlock(&tp->tun_mtx); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (ifa->ifa_addr->sa_family == AF_INET) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); } else { rtinit(ifa, (int)RTM_DELETE, 0); } } if_purgeaddrs(ifp); mtx_lock(&tp->tun_mtx); } if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); cv_broadcast(&tp->tun_cv); mtx_unlock(&tp->tun_mtx); return (0); } static void tuninit(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; #ifdef INET struct ifaddr *ifa; #endif TUNDEBUG(ifp, "tuninit\n"); mtx_lock(&tp->tun_mtx); ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; getmicrotime(&ifp->if_lastchange); #ifdef INET if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *si; si = (struct sockaddr_in *)ifa->ifa_addr; if (si->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } } if_addr_runlock(ifp); #endif mtx_unlock(&tp->tun_mtx); } /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tun_softc *tp = ifp->if_softc; struct ifstat *ifs; int error = 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; mtx_lock(&tp->tun_mtx); if (tp->tun_pid) sprintf(ifs->ascii + strlen(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); mtx_unlock(&tp->tun_mtx); break; case SIOCSIFADDR: tuninit(ifp); TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFDSTADDR: tuninit(ifp); TUNDEBUG(ifp, "destination address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; default: error = EINVAL; } return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct route *ro) { struct tun_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ mtx_lock(&tp->tun_mtx); cached_tun_flags = tp->tun_flags; mtx_unlock(&tp->tun_mtx); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) { bcopy(dst->sa_data, &af, sizeof(af)); dst->sa_family = af; } if (bpf_peers_present(ifp->if_bpf)) { af = dst->sa_family; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); } /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_DONTWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { ifp->if_iqdrops++; ifp->if_oerrors++; return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_DONTWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { ifp->if_iqdrops++; ifp->if_oerrors++; return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(dst->sa_family); } else { #ifdef INET if (dst->sa_family != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); ifp->if_opackets++; return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { int error; struct tun_softc *tp = dev->si_drv1; struct tuninfo *tunp; switch (cmd) { case TUNSIFINFO: tunp = (struct tuninfo *)data; if (tunp->mtu < IF_MINMTU) return (EINVAL); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); } mtx_lock(&tp->tun_mtx); TUN2IFP(tp)->if_mtu = tunp->mtu; TUN2IFP(tp)->if_type = tunp->type; TUN2IFP(tp)->if_baudrate = tunp->baudrate; mtx_unlock(&tp->tun_mtx); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; mtx_lock(&tp->tun_mtx); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; mtx_unlock(&tp->tun_mtx); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case TUNSLMODE: mtx_lock(&tp->tun_mtx); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; mtx_unlock(&tp->tun_mtx); break; case TUNSIFHEAD: mtx_lock(&tp->tun_mtx); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; mtx_unlock(&tp->tun_mtx); break; case TUNGIFHEAD: mtx_lock(&tp->tun_mtx); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; mtx_unlock(&tp->tun_mtx); break; case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return(EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: mtx_lock(&tp->tun_mtx); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; mtx_unlock(&tp->tun_mtx); break; default: return(EINVAL); } break; case TUNSIFPID: mtx_lock(&tp->tun_mtx); tp->tun_pid = curthread->td_proc->p_pid; mtx_unlock(&tp->tun_mtx); break; case FIONBIO: break; case FIOASYNC: mtx_lock(&tp->tun_mtx); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; mtx_unlock(&tp->tun_mtx); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; int error=0, len; TUNDEBUG (ifp, "read\n"); mtx_lock(&tp->tun_mtx); if ((tp->tun_flags & TUN_READY) != TUN_READY) { mtx_unlock(&tp->tun_mtx); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; do { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { mtx_unlock(&tp->tun_mtx); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { mtx_unlock(&tp->tun_mtx); return (error); } } } while (m == NULL); mtx_unlock(&tp->tun_mtx); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; uint32_t family; int isr; TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); if (uio->uio_resid < 0 || uio->uio_resid > TUNMRU) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) { ifp->if_ierrors++; return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* Could be unlocked read? */ mtx_lock(&tp->tun_mtx); if (tp->tun_flags & TUN_IFHEAD) { mtx_unlock(&tp->tun_mtx); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { mtx_unlock(&tp->tun_mtx); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif #ifdef IPX case AF_IPX: isr = NETISR_IPX; break; #endif #ifdef NETATALK case AF_APPLETALK: isr = NETISR_ATALK2; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } /* First chunk of an mbuf contains good junk */ if (harvest.point_to_point) random_harvest(m, 16, 3, 0, RANDOM_NET); ifp->if_ibytes += m->m_pkthdr.len; ifp->if_ipackets++; CURVNET_SET(ifp->if_vnet); netisr_dispatch(isr, m); CURVNET_RESTORE(); return (0); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; struct mbuf *m; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m != NULL) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tun_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tun_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tun_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } Index: stable/8/sys/security/audit/audit_pipe.c =================================================================== --- stable/8/sys/security/audit/audit_pipe.c (revision 225584) +++ stable/8/sys/security/audit/audit_pipe.c (revision 225585) @@ -1,1128 +1,1129 @@ /*- * Copyright (c) 2006 Robert N. M. Watson * Copyright (c) 2008-2009 Apple, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Implementation of a clonable special device providing a live stream of BSM * audit data. Consumers receive a "tee" of the system audit trail by * default, but may also define alternative event selections using ioctls. * This interface provides unreliable but timely access to audit events. * Consumers should be very careful to avoid introducing event cycles. */ /* * Memory types. */ static MALLOC_DEFINE(M_AUDIT_PIPE, "audit_pipe", "Audit pipes"); static MALLOC_DEFINE(M_AUDIT_PIPE_ENTRY, "audit_pipeent", "Audit pipe entries and buffers"); static MALLOC_DEFINE(M_AUDIT_PIPE_PRESELECT, "audit_pipe_presel", "Audit pipe preselection structure"); /* * Audit pipe buffer parameters. */ #define AUDIT_PIPE_QLIMIT_DEFAULT (128) #define AUDIT_PIPE_QLIMIT_MIN (1) #define AUDIT_PIPE_QLIMIT_MAX (1024) /* * Description of an entry in an audit_pipe. */ struct audit_pipe_entry { void *ape_record; u_int ape_record_len; TAILQ_ENTRY(audit_pipe_entry) ape_queue; }; /* * Audit pipes allow processes to express "interest" in the set of records * that are delivered via the pipe. They do this in a similar manner to the * mechanism for audit trail configuration, by expressing two global masks, * and optionally expressing per-auid masks. The following data structure is * the per-auid mask description. The global state is stored in the audit * pipe data structure. * * We may want to consider a more space/time-efficient data structure once * usage patterns for per-auid specifications are clear. */ struct audit_pipe_preselect { au_id_t app_auid; au_mask_t app_mask; TAILQ_ENTRY(audit_pipe_preselect) app_list; }; /* * Description of an individual audit_pipe. Consists largely of a bounded * length queue. */ #define AUDIT_PIPE_ASYNC 0x00000001 #define AUDIT_PIPE_NBIO 0x00000002 struct audit_pipe { int ap_open; /* Device open? */ u_int ap_flags; struct selinfo ap_selinfo; struct sigio *ap_sigio; /* * Per-pipe mutex protecting most fields in this data structure. */ struct mtx ap_mtx; /* * Per-pipe sleep lock serializing user-generated reads and flushes. * uiomove() is called to copy out the current head record's data * while the record remains in the queue, so we prevent other threads * from removing it using this lock. */ struct sx ap_sx; /* * Condition variable to signal when data has been delivered to a * pipe. */ struct cv ap_cv; /* * Various queue-reated variables: qlen and qlimit are a count of * records in the queue; qbyteslen is the number of bytes of data * across all records, and qoffset is the amount read so far of the * first record in the queue. The number of bytes available for * reading in the queue is qbyteslen - qoffset. */ u_int ap_qlen; u_int ap_qlimit; u_int ap_qbyteslen; u_int ap_qoffset; /* * Per-pipe operation statistics. */ u_int64_t ap_inserts; /* Records added. */ u_int64_t ap_reads; /* Records read. */ u_int64_t ap_drops; /* Records dropped. */ /* * Fields relating to pipe interest: global masks for unmatched * processes (attributable, non-attributable), and a list of specific * interest specifications by auid. */ int ap_preselect_mode; au_mask_t ap_preselect_flags; au_mask_t ap_preselect_naflags; TAILQ_HEAD(, audit_pipe_preselect) ap_preselect_list; /* * Current pending record list. Protected by a combination of ap_mtx * and ap_sx. Note particularly that *both* locks are required to * remove a record from the head of the queue, as an in-progress read * may sleep while copying and therefore cannot hold ap_mtx. */ TAILQ_HEAD(, audit_pipe_entry) ap_queue; /* * Global pipe list. */ TAILQ_ENTRY(audit_pipe) ap_list; }; #define AUDIT_PIPE_LOCK(ap) mtx_lock(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_ASSERT(ap) mtx_assert(&(ap)->ap_mtx, MA_OWNED) #define AUDIT_PIPE_LOCK_DESTROY(ap) mtx_destroy(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_INIT(ap) mtx_init(&(ap)->ap_mtx, \ "audit_pipe_mtx", NULL, MTX_DEF) #define AUDIT_PIPE_UNLOCK(ap) mtx_unlock(&(ap)->ap_mtx) #define AUDIT_PIPE_MTX(ap) (&(ap)->ap_mtx) #define AUDIT_PIPE_SX_LOCK_DESTROY(ap) sx_destroy(&(ap)->ap_sx) #define AUDIT_PIPE_SX_LOCK_INIT(ap) sx_init(&(ap)->ap_sx, "audit_pipe_sx") #define AUDIT_PIPE_SX_XLOCK_ASSERT(ap) sx_assert(&(ap)->ap_sx, SA_XLOCKED) #define AUDIT_PIPE_SX_XLOCK_SIG(ap) sx_xlock_sig(&(ap)->ap_sx) #define AUDIT_PIPE_SX_XUNLOCK(ap) sx_xunlock(&(ap)->ap_sx) /* * Global list of audit pipes, rwlock to protect it. Individual record * queues on pipes are protected by per-pipe locks; these locks synchronize * between threads walking the list to deliver to individual pipes and add/ * remove of pipes, and are mostly acquired for read. */ static TAILQ_HEAD(, audit_pipe) audit_pipe_list; static struct rwlock audit_pipe_lock; #define AUDIT_PIPE_LIST_LOCK_INIT() rw_init(&audit_pipe_lock, \ "audit_pipe_list_lock") #define AUDIT_PIPE_LIST_RLOCK() rw_rlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_RUNLOCK() rw_runlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK() rw_wlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK_ASSERT() rw_assert(&audit_pipe_lock, \ RA_WLOCKED) #define AUDIT_PIPE_LIST_WUNLOCK() rw_wunlock(&audit_pipe_lock) /* * Cloning related variables and constants. */ #define AUDIT_PIPE_NAME "auditpipe" static eventhandler_tag audit_pipe_eh_tag; static struct clonedevs *audit_pipe_clones; /* * Special device methods and definition. */ static d_open_t audit_pipe_open; static d_close_t audit_pipe_close; static d_read_t audit_pipe_read; static d_ioctl_t audit_pipe_ioctl; static d_poll_t audit_pipe_poll; static d_kqfilter_t audit_pipe_kqfilter; static struct cdevsw audit_pipe_cdevsw = { .d_version = D_VERSION, .d_flags = D_PSEUDO | D_NEEDMINOR, .d_open = audit_pipe_open, .d_close = audit_pipe_close, .d_read = audit_pipe_read, .d_ioctl = audit_pipe_ioctl, .d_poll = audit_pipe_poll, .d_kqfilter = audit_pipe_kqfilter, .d_name = AUDIT_PIPE_NAME, }; static int audit_pipe_kqread(struct knote *note, long hint); static void audit_pipe_kqdetach(struct knote *note); static struct filterops audit_pipe_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = audit_pipe_kqdetach, .f_event = audit_pipe_kqread, }; /* * Some global statistics on audit pipes. */ static int audit_pipe_count; /* Current number of pipes. */ static u_int64_t audit_pipe_ever; /* Pipes ever allocated. */ static u_int64_t audit_pipe_records; /* Records seen. */ static u_int64_t audit_pipe_drops; /* Global record drop count. */ /* * Free an audit pipe entry. */ static void audit_pipe_entry_free(struct audit_pipe_entry *ape) { free(ape->ape_record, M_AUDIT_PIPE_ENTRY); free(ape, M_AUDIT_PIPE_ENTRY); } /* * Find an audit pipe preselection specification for an auid, if any. */ static struct audit_pipe_preselect * audit_pipe_preselect_find(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); TAILQ_FOREACH(app, &ap->ap_preselect_list, app_list) { if (app->app_auid == auid) return (app); } return (NULL); } /* * Query the per-pipe mask for a specific auid. */ static int audit_pipe_preselect_get(struct audit_pipe *ap, au_id_t auid, au_mask_t *maskp) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { *maskp = app->app_mask; error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); return (error); } /* * Set the per-pipe mask for a specific auid. Add a new entry if needed; * otherwise, update the current entry. */ static void audit_pipe_preselect_set(struct audit_pipe *ap, au_id_t auid, au_mask_t mask) { struct audit_pipe_preselect *app, *app_new; /* * Pessimistically assume that the auid doesn't already have a mask * set, and allocate. We will free it if it is unneeded. */ app_new = malloc(sizeof(*app_new), M_AUDIT_PIPE_PRESELECT, M_WAITOK); AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { app = app_new; app_new = NULL; app->app_auid = auid; TAILQ_INSERT_TAIL(&ap->ap_preselect_list, app, app_list); } app->app_mask = mask; AUDIT_PIPE_UNLOCK(ap); if (app_new != NULL) free(app_new, M_AUDIT_PIPE_PRESELECT); } /* * Delete a per-auid mask on an audit pipe. */ static int audit_pipe_preselect_delete(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); if (app != NULL) free(app, M_AUDIT_PIPE_PRESELECT); return (error); } /* * Delete all per-auid masks on an audit pipe. */ static void audit_pipe_preselect_flush_locked(struct audit_pipe *ap) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); while ((app = TAILQ_FIRST(&ap->ap_preselect_list)) != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); free(app, M_AUDIT_PIPE_PRESELECT); } } static void audit_pipe_preselect_flush(struct audit_pipe *ap) { AUDIT_PIPE_LOCK(ap); audit_pipe_preselect_flush_locked(ap); AUDIT_PIPE_UNLOCK(ap); } /*- * Determine whether a specific audit pipe matches a record with these * properties. Algorithm is as follows: * * - If the pipe is configured to track the default trail configuration, then * use the results of global preselection matching. * - If not, search for a specifically configured auid entry matching the * event. If an entry is found, use that. * - Otherwise, use the default flags or naflags configured for the pipe. */ static int audit_pipe_preselect_check(struct audit_pipe *ap, au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); switch (ap->ap_preselect_mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: return (trail_preselect); case AUDITPIPE_PRESELECT_MODE_LOCAL: app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { if (auid == AU_DEFAUDITID) return (au_preselect(event, class, &ap->ap_preselect_naflags, sorf)); else return (au_preselect(event, class, &ap->ap_preselect_flags, sorf)); } else return (au_preselect(event, class, &app->app_mask, sorf)); default: panic("audit_pipe_preselect_check: mode %d", ap->ap_preselect_mode); } return (0); } /* * Determine whether there exists a pipe interested in a record with specific * properties. */ int audit_pipe_preselect(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe *ap; /* Lockless read to avoid acquiring the global lock if not needed. */ if (TAILQ_EMPTY(&audit_pipe_list)) return (0); AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_preselect)) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_LIST_RUNLOCK(); return (1); } AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); return (0); } /* * Append individual record to a queue -- allocate queue-local buffer, and * add to the queue. If the queue is full or we can't allocate memory, drop * the newest record. */ static void audit_pipe_append(struct audit_pipe *ap, void *record, u_int record_len) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen >= ap->ap_qlimit) { ap->ap_drops++; audit_pipe_drops++; return; } ape = malloc(sizeof(*ape), M_AUDIT_PIPE_ENTRY, M_NOWAIT | M_ZERO); if (ape == NULL) { ap->ap_drops++; audit_pipe_drops++; return; } ape->ape_record = malloc(record_len, M_AUDIT_PIPE_ENTRY, M_NOWAIT); if (ape->ape_record == NULL) { free(ape, M_AUDIT_PIPE_ENTRY); ap->ap_drops++; audit_pipe_drops++; return; } bcopy(record, ape->ape_record, record_len); ape->ape_record_len = record_len; TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue); ap->ap_inserts++; ap->ap_qlen++; ap->ap_qbyteslen += ape->ape_record_len; selwakeuppri(&ap->ap_selinfo, PSOCK); KNOTE_LOCKED(&ap->ap_selinfo.si_note, 0); if (ap->ap_flags & AUDIT_PIPE_ASYNC) pgsigio(&ap->ap_sigio, SIGIO, 0); cv_broadcast(&ap->ap_cv); } /* * audit_pipe_submit(): audit_worker submits audit records via this * interface, which arranges for them to be delivered to pipe queues. */ void audit_pipe_submit(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_select, void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_select)) audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * audit_pipe_submit_user(): the same as audit_pipe_submit(), except that * since we don't currently have selection information available, it is * delivered to the pipe unconditionally. * * XXXRW: This is a bug. The BSM check routine for submitting a user record * should parse that information and return it. */ void audit_pipe_submit_user(void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * Allocate a new audit pipe. Connects the pipe, on success, to the global * list and updates statistics. */ static struct audit_pipe * audit_pipe_alloc(void) { struct audit_pipe *ap; AUDIT_PIPE_LIST_WLOCK_ASSERT(); ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO); if (ap == NULL) return (NULL); ap->ap_qlimit = AUDIT_PIPE_QLIMIT_DEFAULT; TAILQ_INIT(&ap->ap_queue); knlist_init_mtx(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap)); AUDIT_PIPE_LOCK_INIT(ap); AUDIT_PIPE_SX_LOCK_INIT(ap); cv_init(&ap->ap_cv, "audit_pipe"); /* * Default flags, naflags, and auid-specific preselection settings to * 0. Initialize the mode to the global trail so that if praudit(1) * is run on /dev/auditpipe, it sees events associated with the * default trail. Pipe-aware application can clear the flag, set * custom masks, and flush the pipe as needed. */ bzero(&ap->ap_preselect_flags, sizeof(ap->ap_preselect_flags)); bzero(&ap->ap_preselect_naflags, sizeof(ap->ap_preselect_naflags)); TAILQ_INIT(&ap->ap_preselect_list); ap->ap_preselect_mode = AUDITPIPE_PRESELECT_MODE_TRAIL; /* * Add to global list and update global statistics. */ TAILQ_INSERT_HEAD(&audit_pipe_list, ap, ap_list); audit_pipe_count++; audit_pipe_ever++; return (ap); } /* * Flush all records currently present in an audit pipe; assume mutex is held. */ static void audit_pipe_flush(struct audit_pipe *ap) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; } ap->ap_qoffset = 0; KASSERT(ap->ap_qlen == 0, ("audit_pipe_free: ap_qbyteslen")); KASSERT(ap->ap_qbyteslen == 0, ("audit_pipe_flush: ap_qbyteslen")); } /* * Free an audit pipe; this means freeing all preselection state and all * records in the pipe. Assumes global write lock and pipe mutex are held to * prevent any new records from being inserted during the free, and that the * audit pipe is still on the global list. */ static void audit_pipe_free(struct audit_pipe *ap) { AUDIT_PIPE_LIST_WLOCK_ASSERT(); AUDIT_PIPE_LOCK_ASSERT(ap); audit_pipe_preselect_flush_locked(ap); audit_pipe_flush(ap); cv_destroy(&ap->ap_cv); AUDIT_PIPE_SX_LOCK_DESTROY(ap); AUDIT_PIPE_LOCK_DESTROY(ap); + seldrain(&ap->ap_selinfo); knlist_destroy(&ap->ap_selinfo.si_note); TAILQ_REMOVE(&audit_pipe_list, ap, ap_list); free(ap, M_AUDIT_PIPE); audit_pipe_count--; } /* * Audit pipe clone routine -- provide specific requested audit pipe, or a * fresh one if a specific one is not requested. */ static void audit_pipe_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { int i, u; if (*dev != NULL) return; if (strcmp(name, AUDIT_PIPE_NAME) == 0) u = -1; else if (dev_stdclone(name, NULL, AUDIT_PIPE_NAME, &u) != 1) return; i = clone_create(&audit_pipe_clones, &audit_pipe_cdevsw, &u, dev, 0); if (i) { *dev = make_dev(&audit_pipe_cdevsw, u, UID_ROOT, GID_WHEEL, 0600, "%s%d", AUDIT_PIPE_NAME, u); if (*dev != NULL) { dev_ref(*dev); (*dev)->si_flags |= SI_CHEAPCLONE; } } } /* * Audit pipe open method. Explicit privilege check isn't used as this * allows file permissions on the special device to be used to grant audit * review access. Those file permissions should be managed carefully. */ static int audit_pipe_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct audit_pipe *ap; AUDIT_PIPE_LIST_WLOCK(); ap = dev->si_drv1; if (ap == NULL) { ap = audit_pipe_alloc(); if (ap == NULL) { AUDIT_PIPE_LIST_WUNLOCK(); return (ENOMEM); } dev->si_drv1 = ap; } else { KASSERT(ap->ap_open, ("audit_pipe_open: ap && !ap_open")); AUDIT_PIPE_LIST_WUNLOCK(); return (EBUSY); } ap->ap_open = 1; /* No lock required yet. */ AUDIT_PIPE_LIST_WUNLOCK(); fsetown(td->td_proc->p_pid, &ap->ap_sigio); return (0); } /* * Close audit pipe, tear down all records, etc. */ static int audit_pipe_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct audit_pipe *ap; ap = dev->si_drv1; KASSERT(ap != NULL, ("audit_pipe_close: ap == NULL")); KASSERT(ap->ap_open, ("audit_pipe_close: !ap_open")); funsetown(&ap->ap_sigio); AUDIT_PIPE_LIST_WLOCK(); AUDIT_PIPE_LOCK(ap); ap->ap_open = 0; audit_pipe_free(ap); dev->si_drv1 = NULL; AUDIT_PIPE_LIST_WUNLOCK(); return (0); } /* * Audit pipe ioctl() routine. Handle file descriptor and audit pipe layer * commands. */ static int audit_pipe_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct auditpipe_ioctl_preselect *aip; struct audit_pipe *ap; au_mask_t *maskp; int error, mode; au_id_t auid; ap = dev->si_drv1; KASSERT(ap != NULL, ("audit_pipe_ioctl: ap == NULL")); /* * Audit pipe ioctls: first come standard device node ioctls, then * manipulation of pipe settings, and finally, statistics query * ioctls. */ switch (cmd) { case FIONBIO: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_NBIO; else ap->ap_flags &= ~AUDIT_PIPE_NBIO; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIONREAD: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_qbyteslen - ap->ap_qoffset; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOASYNC: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_ASYNC; else ap->ap_flags &= ~AUDIT_PIPE_ASYNC; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOSETOWN: error = fsetown(*(int *)data, &ap->ap_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&ap->ap_sigio); error = 0; break; case AUDITPIPE_GET_QLEN: *(u_int *)data = ap->ap_qlen; error = 0; break; case AUDITPIPE_GET_QLIMIT: *(u_int *)data = ap->ap_qlimit; error = 0; break; case AUDITPIPE_SET_QLIMIT: /* Lockless integer write. */ if (*(u_int *)data >= AUDIT_PIPE_QLIMIT_MIN || *(u_int *)data <= AUDIT_PIPE_QLIMIT_MAX) { ap->ap_qlimit = *(u_int *)data; error = 0; } else error = EINVAL; break; case AUDITPIPE_GET_QLIMIT_MIN: *(u_int *)data = AUDIT_PIPE_QLIMIT_MIN; error = 0; break; case AUDITPIPE_GET_QLIMIT_MAX: *(u_int *)data = AUDIT_PIPE_QLIMIT_MAX; error = 0; break; case AUDITPIPE_GET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_flags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_flags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_naflags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_naflags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; error = audit_pipe_preselect_get(ap, aip->aip_auid, &aip->aip_mask); break; case AUDITPIPE_SET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; audit_pipe_preselect_set(ap, aip->aip_auid, aip->aip_mask); error = 0; break; case AUDITPIPE_DELETE_PRESELECT_AUID: auid = *(au_id_t *)data; error = audit_pipe_preselect_delete(ap, auid); break; case AUDITPIPE_FLUSH_PRESELECT_AUID: audit_pipe_preselect_flush(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_MODE: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_preselect_mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_MODE: mode = *(int *)data; switch (mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: case AUDITPIPE_PRESELECT_MODE_LOCAL: AUDIT_PIPE_LOCK(ap); ap->ap_preselect_mode = mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; default: error = EINVAL; } break; case AUDITPIPE_FLUSH: if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); audit_pipe_flush(ap); AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); error = 0; break; case AUDITPIPE_GET_MAXAUDITDATA: *(u_int *)data = MAXAUDITDATA; error = 0; break; case AUDITPIPE_GET_INSERTS: *(u_int *)data = ap->ap_inserts; error = 0; break; case AUDITPIPE_GET_READS: *(u_int *)data = ap->ap_reads; error = 0; break; case AUDITPIPE_GET_DROPS: *(u_int *)data = ap->ap_drops; error = 0; break; case AUDITPIPE_GET_TRUNCATES: *(u_int *)data = 0; error = 0; break; default: error = ENOTTY; } return (error); } /* * Audit pipe read. Read one or more partial or complete records to user * memory. */ static int audit_pipe_read(struct cdev *dev, struct uio *uio, int flag) { struct audit_pipe_entry *ape; struct audit_pipe *ap; u_int toread; int error; ap = dev->si_drv1; KASSERT(ap != NULL, ("audit_pipe_read: ap == NULL")); /* * We hold an sx(9) lock over read and flush because we rely on the * stability of a record in the queue during uiomove(9). */ if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); while (TAILQ_EMPTY(&ap->ap_queue)) { if (ap->ap_flags & AUDIT_PIPE_NBIO) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (EAGAIN); } error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap)); if (error) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } } /* * Copy as many remaining bytes from the current record to userspace * as we can. Keep processing records until we run out of records in * the queue, or until the user buffer runs out of space. * * Note: we rely on the SX lock to maintain ape's stability here. */ ap->ap_reads++; while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL && uio->uio_resid > 0) { AUDIT_PIPE_LOCK_ASSERT(ap); KASSERT(ape->ape_record_len > ap->ap_qoffset, ("audit_pipe_read: record_len > qoffset (1)")); toread = MIN(ape->ape_record_len - ap->ap_qoffset, uio->uio_resid); AUDIT_PIPE_UNLOCK(ap); error = uiomove((char *)ape->ape_record + ap->ap_qoffset, toread, uio); if (error) { AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } /* * If the copy succeeded, update book-keeping, and if no * bytes remain in the current record, free it. */ AUDIT_PIPE_LOCK(ap); KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape, ("audit_pipe_read: queue out of sync after uiomove")); ap->ap_qoffset += toread; KASSERT(ape->ape_record_len >= ap->ap_qoffset, ("audit_pipe_read: record_len >= qoffset (2)")); if (ap->ap_qoffset == ape->ape_record_len) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; ap->ap_qoffset = 0; } } AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (0); } /* * Audit pipe poll. */ static int audit_pipe_poll(struct cdev *dev, int events, struct thread *td) { struct audit_pipe *ap; int revents; revents = 0; ap = dev->si_drv1; KASSERT(ap != NULL, ("audit_pipe_poll: ap == NULL")); if (events & (POLLIN | POLLRDNORM)) { AUDIT_PIPE_LOCK(ap); if (TAILQ_FIRST(&ap->ap_queue) != NULL) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &ap->ap_selinfo); AUDIT_PIPE_UNLOCK(ap); } return (revents); } /* * Audit pipe kqfilter. */ static int audit_pipe_kqfilter(struct cdev *dev, struct knote *kn) { struct audit_pipe *ap; ap = dev->si_drv1; KASSERT(ap != NULL, ("audit_pipe_kqfilter: ap == NULL")); if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &audit_pipe_read_filterops; kn->kn_hook = ap; AUDIT_PIPE_LOCK(ap); knlist_add(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); return (0); } /* * Return true if there are records available for reading on the pipe. */ static int audit_pipe_kqread(struct knote *kn, long hint) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; KASSERT(ap != NULL, ("audit_pipe_kqread: ap == NULL")); AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen != 0) { kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset; return (1); } else { kn->kn_data = 0; return (0); } } /* * Detach kqueue state from audit pipe. */ static void audit_pipe_kqdetach(struct knote *kn) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; KASSERT(ap != NULL, ("audit_pipe_kqdetach: ap == NULL")); AUDIT_PIPE_LOCK(ap); knlist_remove(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); } /* * Initialize the audit pipe system. */ static void audit_pipe_init(void *unused) { TAILQ_INIT(&audit_pipe_list); AUDIT_PIPE_LIST_LOCK_INIT(); clone_setup(&audit_pipe_clones); audit_pipe_eh_tag = EVENTHANDLER_REGISTER(dev_clone, audit_pipe_clone, 0, 1000); if (audit_pipe_eh_tag == NULL) panic("audit_pipe_init: EVENTHANDLER_REGISTER"); } SYSINIT(audit_pipe_init, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, audit_pipe_init, NULL); Index: stable/8/sys/sys/selinfo.h =================================================================== --- stable/8/sys/sys/selinfo.h (revision 225584) +++ stable/8/sys/sys/selinfo.h (revision 225585) @@ -1,60 +1,61 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)select.h 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #ifndef _SYS_SELINFO_H_ #define _SYS_SELINFO_H_ #include /* for struct klist */ struct selfd; TAILQ_HEAD(selfdlist, selfd); /* * Used to maintain information about processes that wish to be * notified when I/O becomes possible. */ struct selinfo { struct selfdlist si_tdlist; /* List of sleeping threads. */ struct knlist si_note; /* kernel note list */ struct mtx *si_mtx; /* Lock for tdlist. */ }; #define SEL_WAITING(si) (!TAILQ_EMPTY(&(si)->si_tdlist)) #ifdef _KERNEL +void seldrain(struct selinfo *sip); void selrecord(struct thread *selector, struct selinfo *sip); void selwakeup(struct selinfo *sip); void selwakeuppri(struct selinfo *sip, int pri); void seltdfini(struct thread *td); #endif #endif /* !_SYS_SELINFO_H_ */ Index: stable/8/sys =================================================================== --- stable/8/sys (revision 225584) +++ stable/8/sys (revision 225585) Property changes on: stable/8/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r225177