Index: head/share/man/man9/Makefile =================================================================== --- head/share/man/man9/Makefile (revision 350693) +++ head/share/man/man9/Makefile (revision 350694) @@ -1,2294 +1,2295 @@ # $FreeBSD$ .include MAN= accept_filter.9 \ accf_data.9 \ accf_dns.9 \ accf_http.9 \ acl.9 \ alq.9 \ altq.9 \ atomic.9 \ bhnd.9 \ bhnd_erom.9 \ bios.9 \ bitset.9 \ boot.9 \ bpf.9 \ buf.9 \ buf_ring.9 \ BUF_ISLOCKED.9 \ BUF_LOCK.9 \ BUF_LOCKFREE.9 \ BUF_LOCKINIT.9 \ BUF_RECURSED.9 \ BUF_TIMELOCK.9 \ BUF_UNLOCK.9 \ bus_activate_resource.9 \ BUS_ADD_CHILD.9 \ bus_adjust_resource.9 \ bus_alloc_resource.9 \ BUS_BIND_INTR.9 \ bus_child_present.9 \ BUS_CHILD_DELETED.9 \ BUS_CHILD_DETACHED.9 \ BUS_CONFIG_INTR.9 \ BUS_DESCRIBE_INTR.9 \ bus_dma.9 \ bus_generic_attach.9 \ bus_generic_detach.9 \ bus_generic_new_pass.9 \ bus_generic_print_child.9 \ bus_generic_read_ivar.9 \ bus_generic_shutdown.9 \ BUS_GET_CPUS.9 \ bus_get_resource.9 \ bus_map_resource.9 \ BUS_NEW_PASS.9 \ BUS_PRINT_CHILD.9 \ BUS_READ_IVAR.9 \ BUS_RESCAN.9 \ bus_release_resource.9 \ bus_set_pass.9 \ bus_set_resource.9 \ BUS_SETUP_INTR.9 \ bus_space.9 \ byteorder.9 \ casuword.9 \ cd.9 \ cnv.9 \ condvar.9 \ config_intrhook.9 \ contigmalloc.9 \ copy.9 \ counter.9 \ cpuset.9 \ cr_cansee.9 \ critical_enter.9 \ cr_seeothergids.9 \ cr_seeotheruids.9 \ crypto.9 \ CTASSERT.9 \ DB_COMMAND.9 \ DECLARE_GEOM_CLASS.9 \ DECLARE_MODULE.9 \ DEFINE_IFUNC.9 \ DELAY.9 \ devclass.9 \ devclass_find.9 \ devclass_get_device.9 \ devclass_get_devices.9 \ devclass_get_drivers.9 \ devclass_get_maxunit.9 \ devclass_get_name.9 \ devclass_get_softc.9 \ dev_clone.9 \ devfs_set_cdevpriv.9 \ device.9 \ device_add_child.9 \ DEVICE_ATTACH.9 \ device_delete_child.9 \ device_delete_children.9 \ DEVICE_DETACH.9 \ device_enable.9 \ device_find_child.9 \ device_get_children.9 \ device_get_devclass.9 \ device_get_driver.9 \ device_get_ivars.9 \ device_get_name.9 \ device_get_parent.9 \ device_get_softc.9 \ device_get_state.9 \ device_get_sysctl.9 \ device_get_unit.9 \ DEVICE_IDENTIFY.9 \ device_printf.9 \ DEVICE_PROBE.9 \ device_probe_and_attach.9 \ device_quiet.9 \ device_set_desc.9 \ device_set_driver.9 \ device_set_flags.9 \ DEVICE_SHUTDOWN.9 \ DEV_MODULE.9 \ dev_refthread.9 \ devstat.9 \ devtoname.9 \ disk.9 \ dnv.9 \ domain.9 \ domainset.9 \ dpcpu.9 \ drbr.9 \ driver.9 \ DRIVER_MODULE.9 \ efirt.9 \ epoch.9 \ EVENTHANDLER.9 \ eventtimers.9 \ extattr.9 \ fail.9 \ fdt_pinctrl.9 \ fetch.9 \ firmware.9 \ fpu_kern.9 \ g_access.9 \ g_attach.9 \ g_bio.9 \ g_consumer.9 \ g_data.9 \ get_cyclecount.9 \ getenv.9 \ getnewvnode.9 \ g_event.9 \ g_geom.9 \ g_provider.9 \ g_provider_by_name.9 \ groupmember.9 \ g_wither_geom.9 \ hash.9 \ hashinit.9 \ hexdump.9 \ hhook.9 \ ieee80211.9 \ ieee80211_amrr.9 \ ieee80211_beacon.9 \ ieee80211_bmiss.9 \ ieee80211_crypto.9 \ ieee80211_ddb.9 \ ieee80211_input.9 \ ieee80211_node.9 \ ieee80211_output.9 \ ieee80211_proto.9 \ ieee80211_radiotap.9 \ ieee80211_regdomain.9 \ ieee80211_scan.9 \ ieee80211_vap.9 \ iflib.9 \ iflibdd.9 \ iflibdi.9 \ iflibtxrx.9 \ ifnet.9 \ inittodr.9 \ insmntque.9 \ intro.9 \ ithread.9 \ KASSERT.9 \ kern_testfrwk.9 \ kernacc.9 \ kernel_mount.9 \ khelp.9 \ kobj.9 \ kproc.9 \ kqueue.9 \ kthread.9 \ ktr.9 \ lock.9 \ locking.9 \ LOCK_PROFILING.9 \ mac.9 \ make_dev.9 \ malloc.9 \ mbchain.9 \ mbuf.9 \ mbuf_tags.9 \ MD5.9 \ mdchain.9 \ memcchr.9 \ memguard.9 \ microseq.9 \ microtime.9 \ microuptime.9 \ mi_switch.9 \ mod_cc.9 \ module.9 \ MODULE_DEPEND.9 \ MODULE_PNP_INFO.9 \ MODULE_VERSION.9 \ mtx_pool.9 \ 
mutex.9 \ namei.9 \ netisr.9 \ nv.9 \ OF_child.9 \ OF_device_from_xref.9 \ OF_finddevice.9 \ OF_getprop.9 \ OF_node_from_xref.9 \ OF_package_to_path.9 \ ofw_bus_is_compatible.9 \ ofw_bus_status_okay.9 \ osd.9 \ owll.9 \ own.9 \ panic.9 \ PCBGROUP.9 \ p_candebug.9 \ p_cansee.9 \ pci.9 \ PCI_IOV_ADD_VF.9 \ PCI_IOV_INIT.9 \ pci_iov_schema.9 \ PCI_IOV_UNINIT.9 \ pfil.9 \ pfind.9 \ pget.9 \ pgfind.9 \ PHOLD.9 \ physio.9 \ pmap.9 \ pmap_activate.9 \ pmap_clear_modify.9 \ pmap_copy.9 \ pmap_enter.9 \ pmap_extract.9 \ pmap_growkernel.9 \ pmap_init.9 \ pmap_is_modified.9 \ pmap_is_prefaultable.9 \ pmap_map.9 \ pmap_mincore.9 \ pmap_object_init_pt.9 \ pmap_page_exists_quick.9 \ pmap_page_init.9 \ pmap_pinit.9 \ pmap_protect.9 \ pmap_qenter.9 \ pmap_quick_enter_page.9 \ pmap_release.9 \ pmap_remove.9 \ pmap_resident_count.9 \ pmap_unwire.9 \ pmap_zero_page.9 \ printf.9 \ prison_check.9 \ priv.9 \ proc_rwmem.9 \ pseudofs.9 \ psignal.9 \ pwmbus.9 \ random.9 \ random_harvest.9 \ ratecheck.9 \ redzone.9 \ refcount.9 \ resettodr.9 \ resource_int_value.9 \ rijndael.9 \ rman.9 \ rmlock.9 \ rtalloc.9 \ rtentry.9 \ runqueue.9 \ rwlock.9 \ sbuf.9 \ scheduler.9 \ SDT.9 \ securelevel_gt.9 \ selrecord.9 \ sema.9 \ seqc.9 \ sf_buf.9 \ sglist.9 \ shm_map.9 \ signal.9 \ sleep.9 \ sleepqueue.9 \ socket.9 \ stack.9 \ store.9 \ style.9 \ style.lua.9 \ swi.9 \ sx.9 \ syscall_helper_register.9 \ SYSCALL_MODULE.9 \ sysctl.9 \ sysctl_add_oid.9 \ sysctl_ctx_init.9 \ SYSINIT.9 \ taskqueue.9 \ tcp_functions.9 \ thread_exit.9 \ time.9 \ timeout.9 \ tvtohz.9 \ ucred.9 \ uidinfo.9 \ uio.9 \ unr.9 \ vaccess.9 \ vaccess_acl_nfs4.9 \ vaccess_acl_posix1e.9 \ vcount.9 \ vflush.9 \ VFS.9 \ vfs_busy.9 \ VFS_CHECKEXP.9 \ vfsconf.9 \ VFS_FHTOVP.9 \ vfs_getnewfsid.9 \ vfs_getopt.9 \ vfs_getvfs.9 \ VFS_MOUNT.9 \ vfs_mountedfrom.9 \ VFS_QUOTACTL.9 \ VFS_ROOT.9 \ vfs_rootmountalloc.9 \ VFS_SET.9 \ VFS_STATFS.9 \ vfs_suser.9 \ VFS_SYNC.9 \ vfs_timestamp.9 \ vfs_unbusy.9 \ VFS_UNMOUNT.9 \ vfs_unmountall.9 \ VFS_VGET.9 \ vget.9 \ vgone.9 \ vhold.9 \ vinvalbuf.9 \ vm_fault_prefault.9 \ vm_map.9 \ vm_map_check_protection.9 \ vm_map_create.9 \ vm_map_delete.9 \ vm_map_entry_resize_free.9 \ vm_map_find.9 \ vm_map_findspace.9 \ vm_map_inherit.9 \ vm_map_init.9 \ vm_map_insert.9 \ vm_map_lock.9 \ vm_map_lookup.9 \ vm_map_madvise.9 \ vm_map_max.9 \ vm_map_protect.9 \ vm_map_remove.9 \ vm_map_simplify_entry.9 \ vm_map_stack.9 \ vm_map_submap.9 \ vm_map_sync.9 \ vm_map_wire.9 \ vm_page_alloc.9 \ vm_page_bits.9 \ vm_page_busy.9 \ vm_page_deactivate.9 \ vm_page_dontneed.9 \ vm_page_aflag.9 \ vm_page_free.9 \ vm_page_grab.9 \ vm_page_insert.9 \ vm_page_lookup.9 \ vm_page_rename.9 \ vm_page_wire.9 \ vm_set_page_size.9 \ vmem.9 \ vn_fullpath.9 \ vn_isdisk.9 \ vnet.9 \ vnode.9 \ VOP_ACCESS.9 \ VOP_ACLCHECK.9 \ VOP_ADVISE.9 \ VOP_ADVLOCK.9 \ VOP_ALLOCATE.9 \ VOP_ATTRIB.9 \ VOP_BMAP.9 \ VOP_BWRITE.9 \ VOP_COPY_FILE_RANGE.9 \ VOP_CREATE.9 \ VOP_FSYNC.9 \ VOP_GETACL.9 \ VOP_GETEXTATTR.9 \ VOP_GETPAGES.9 \ VOP_INACTIVE.9 \ VOP_IOCTL.9 \ VOP_LINK.9 \ VOP_LISTEXTATTR.9 \ VOP_LOCK.9 \ VOP_LOOKUP.9 \ VOP_OPENCLOSE.9 \ VOP_PATHCONF.9 \ VOP_PRINT.9 \ VOP_RDWR.9 \ VOP_READDIR.9 \ VOP_READLINK.9 \ VOP_REALLOCBLKS.9 \ VOP_REMOVE.9 \ VOP_RENAME.9 \ VOP_REVOKE.9 \ VOP_SETACL.9 \ VOP_SETEXTATTR.9 \ VOP_STRATEGY.9 \ VOP_VPTOCNP.9 \ VOP_VPTOFH.9 \ vref.9 \ vrefcnt.9 \ vrele.9 \ vslock.9 \ watchdog.9 \ zone.9 MLINKS= unr.9 alloc_unr.9 \ unr.9 alloc_unrl.9 \ unr.9 alloc_unr_specific.9 \ unr.9 clear_unrhdr.9 \ unr.9 delete_unrhdr.9 \ unr.9 free_unr.9 \ unr.9 new_unrhdr.9 
MLINKS+=accept_filter.9 accept_filt_add.9 \ accept_filter.9 accept_filt_del.9 \ accept_filter.9 accept_filt_generic_mod_event.9 \ accept_filter.9 accept_filt_get.9 MLINKS+=alq.9 ALQ.9 \ alq.9 alq_close.9 \ alq.9 alq_flush.9 \ alq.9 alq_get.9 \ alq.9 alq_getn.9 \ alq.9 alq_open.9 \ alq.9 alq_open_flags.9 \ alq.9 alq_post.9 \ alq.9 alq_post_flags.9 \ alq.9 alq_write.9 \ alq.9 alq_writen.9 MLINKS+=altq.9 ALTQ.9 MLINKS+=atomic.9 atomic_add.9 \ atomic.9 atomic_clear.9 \ atomic.9 atomic_cmpset.9 \ atomic.9 atomic_fcmpset.9 \ atomic.9 atomic_fetchadd.9 \ atomic.9 atomic_load.9 \ atomic.9 atomic_readandclear.9 \ atomic.9 atomic_set.9 \ atomic.9 atomic_store.9 \ atomic.9 atomic_subtract.9 \ atomic.9 atomic_swap.9 \ atomic.9 atomic_testandclear.9 \ atomic.9 atomic_testandset.9 \ atomic.9 atomic_thread_fence.9 MLINKS+=bhnd.9 BHND_MATCH_BOARD_TYPE.9 \ bhnd.9 BHND_MATCH_BOARD_VENDOR.9 \ bhnd.9 BHND_MATCH_CHIP_ID.9 \ bhnd.9 BHND_MATCH_CHIP_PKG.9 \ bhnd.9 BHND_MATCH_CHIP_REV.9 \ bhnd.9 BHND_MATCH_CORE_ID.9 \ bhnd.9 BHND_MATCH_CORE_VENDOR.9 \ bhnd.9 bhnd_activate_resource.9 \ bhnd.9 bhnd_alloc_pmu.9 \ bhnd.9 bhnd_alloc_resource.9 \ bhnd.9 bhnd_alloc_resource_any.9 \ bhnd.9 bhnd_alloc_resources.9 \ bhnd.9 bhnd_board_matches.9 \ bhnd.9 bhnd_bus_match_child.9 \ bhnd.9 bhnd_bus_read_1.9 \ bhnd.9 bhnd_bus_read_2.9 \ bhnd.9 bhnd_bus_read_4.9 \ bhnd.9 bhnd_bus_read_stream_1.9 \ bhnd.9 bhnd_bus_read_stream_2.9 \ bhnd.9 bhnd_bus_read_stream_4.9 \ bhnd.9 bhnd_bus_write_1.9 \ bhnd.9 bhnd_bus_write_2.9 \ bhnd.9 bhnd_bus_write_4.9 \ bhnd.9 bhnd_bus_write_stream_1.9 \ bhnd.9 bhnd_bus_write_stream_2.9 \ bhnd.9 bhnd_bus_write_stream_4.9 \ bhnd.9 bhnd_chip_matches.9 \ bhnd.9 bhnd_core_class.9 \ bhnd.9 bhnd_core_get_match_desc.9 \ bhnd.9 bhnd_core_matches.9 \ bhnd.9 bhnd_core_name.9 \ bhnd.9 bhnd_cores_equal.9 \ bhnd.9 bhnd_deactivate_resource.9 \ bhnd.9 bhnd_decode_port_rid.9 \ bhnd.9 bhnd_deregister_provider.9 \ bhnd.9 bhnd_device_lookup.9 \ bhnd.9 bhnd_device_matches.9 \ bhnd.9 bhnd_device_quirks.9 \ bhnd.9 bhnd_driver_get_erom_class.9 \ bhnd.9 bhnd_enable_clocks.9 \ bhnd.9 bhnd_find_core_class.9 \ bhnd.9 bhnd_find_core_name.9 \ bhnd.9 bhnd_format_chip_id.9 \ bhnd.9 bhnd_get_attach_type.9 \ bhnd.9 bhnd_get_chipid.9 \ bhnd.9 bhnd_get_class.9 \ bhnd.9 bhnd_get_clock_freq.9 \ bhnd.9 bhnd_get_clock_latency.9 \ bhnd.9 bhnd_get_core_index.9 \ bhnd.9 bhnd_get_core_info.9 \ bhnd.9 bhnd_get_core_unit.9 \ bhnd.9 bhnd_get_device.9 \ bhnd.9 bhnd_get_device_name.9 \ bhnd.9 bhnd_get_dma_translation.9 \ bhnd.9 bhnd_get_hwrev.9 \ bhnd.9 bhnd_get_intr_count.9 \ bhnd.9 bhnd_get_intr_ivec.9 \ bhnd.9 bhnd_get_port_count.9 \ bhnd.9 bhnd_get_port_rid.9 \ bhnd.9 bhnd_get_region_addr.9 \ bhnd.9 bhnd_get_region_count.9 \ bhnd.9 bhnd_get_vendor.9 \ bhnd.9 bhnd_get_vendor_name.9 \ bhnd.9 bhnd_hwrev_matches.9 \ bhnd.9 bhnd_is_hw_suspended.9 \ bhnd.9 bhnd_is_region_valid.9 \ bhnd.9 bhnd_map_intr.9 \ bhnd.9 bhnd_match_core.9 \ bhnd.9 bhnd_nvram_getvar.9 \ bhnd.9 bhnd_nvram_getvar_array.9 \ bhnd.9 bhnd_nvram_getvar_int.9 \ bhnd.9 bhnd_nvram_getvar_int16.9 \ bhnd.9 bhnd_nvram_getvar_int32.9 \ bhnd.9 bhnd_nvram_getvar_int8.9 \ bhnd.9 bhnd_nvram_getvar_str.9 \ bhnd.9 bhnd_nvram_getvar_uint.9 \ bhnd.9 bhnd_nvram_getvar_uint16.9 \ bhnd.9 bhnd_nvram_getvar_uint32.9 \ bhnd.9 bhnd_nvram_getvar_uint8.9 \ bhnd.9 bhnd_nvram_string_array_next.9 \ bhnd.9 bhnd_read_board_info.9 \ bhnd.9 bhnd_read_config.9 \ bhnd.9 bhnd_read_ioctl.9 \ bhnd.9 bhnd_read_iost.9 \ bhnd.9 bhnd_register_provider.9 \ bhnd.9 bhnd_release_ext_rsrc.9 \ bhnd.9 bhnd_release_pmu.9 \ bhnd.9 
bhnd_release_provider.9 \ bhnd.9 bhnd_release_resource.9 \ bhnd.9 bhnd_release_resources.9 \ bhnd.9 bhnd_request_clock.9 \ bhnd.9 bhnd_request_ext_rsrc.9 \ bhnd.9 bhnd_reset_hw.9 \ bhnd.9 bhnd_retain_provider.9 \ bhnd.9 bhnd_set_custom_core_desc.9 \ bhnd.9 bhnd_set_default_core_desc.9 \ bhnd.9 bhnd_suspend_hw.9 \ bhnd.9 bhnd_unmap_intr.9 \ bhnd.9 bhnd_vendor_name.9 \ bhnd.9 bhnd_write_config.9 \ bhnd.9 bhnd_write_ioctl.9 MLINKS+=bhnd_erom.9 bhnd_erom_alloc.9 \ bhnd_erom.9 bhnd_erom_dump.9 \ bhnd_erom.9 bhnd_erom_fini_static.9 \ bhnd_erom.9 bhnd_erom_free.9 \ bhnd_erom.9 bhnd_erom_free_core_table.9 \ bhnd_erom.9 bhnd_erom_get_core_table.9 \ bhnd_erom.9 bhnd_erom_init_static.9 \ bhnd_erom.9 bhnd_erom_io.9 \ bhnd_erom.9 bhnd_erom_io_fini.9 \ bhnd_erom.9 bhnd_erom_io_map.9 \ bhnd_erom.9 bhnd_erom_io_read.9 \ bhnd_erom.9 bhnd_erom_iobus_init.9 \ bhnd_erom.9 bhnd_erom_iores_new.9 \ bhnd_erom.9 bhnd_erom_lookup_core.9 \ bhnd_erom.9 bhnd_erom_lookup_core_addr.9 \ bhnd_erom.9 bhnd_erom_probe.9 \ bhnd_erom.9 bhnd_erom_probe_driver_classes.9 MLINKS+=bitset.9 BITSET_DEFINE.9 \ bitset.9 BITSET_T_INITIALIZER.9 \ bitset.9 BITSET_FSET.9 \ bitset.9 BIT_CLR.9 \ bitset.9 BIT_COPY.9 \ bitset.9 BIT_ISSET.9 \ bitset.9 BIT_SET.9 \ bitset.9 BIT_ZERO.9 \ bitset.9 BIT_FILL.9 \ bitset.9 BIT_SETOF.9 \ bitset.9 BIT_EMPTY.9 \ bitset.9 BIT_ISFULLSET.9 \ bitset.9 BIT_FFS.9 \ bitset.9 BIT_COUNT.9 \ bitset.9 BIT_SUBSET.9 \ bitset.9 BIT_OVERLAP.9 \ bitset.9 BIT_CMP.9 \ bitset.9 BIT_OR.9 \ bitset.9 BIT_AND.9 \ bitset.9 BIT_NAND.9 \ bitset.9 BIT_CLR_ATOMIC.9 \ bitset.9 BIT_SET_ATOMIC.9 \ bitset.9 BIT_SET_ATOMIC_ACQ.9 \ bitset.9 BIT_AND_ATOMIC.9 \ bitset.9 BIT_OR_ATOMIC.9 \ bitset.9 BIT_COPY_STORE_REL.9 MLINKS+=bpf.9 bpfattach.9 \ bpf.9 bpfattach2.9 \ bpf.9 bpfdetach.9 \ bpf.9 bpf_filter.9 \ bpf.9 bpf_mtap.9 \ bpf.9 bpf_mtap2.9 \ bpf.9 bpf_tap.9 \ bpf.9 bpf_validate.9 MLINKS+=buf.9 bp.9 MLINKS+=buf_ring.9 buf_ring_alloc.9 \ buf_ring.9 buf_ring_free.9 \ buf_ring.9 buf_ring_enqueue.9 \ buf_ring.9 buf_ring_enqueue_bytes.9 \ buf_ring.9 buf_ring_dequeue_mc.9 \ buf_ring.9 buf_ring_dequeue_sc.9 \ buf_ring.9 buf_ring_count.9 \ buf_ring.9 buf_ring_empty.9 \ buf_ring.9 buf_ring_full.9 \ buf_ring.9 buf_ring_peek.9 MLINKS+=bus_activate_resource.9 bus_deactivate_resource.9 MLINKS+=bus_alloc_resource.9 bus_alloc_resource_any.9 MLINKS+=BUS_BIND_INTR.9 bus_bind_intr.9 MLINKS+=BUS_DESCRIBE_INTR.9 bus_describe_intr.9 MLINKS+=bus_dma.9 busdma.9 \ bus_dma.9 bus_dmamap_create.9 \ bus_dma.9 bus_dmamap_destroy.9 \ bus_dma.9 bus_dmamap_load.9 \ bus_dma.9 bus_dmamap_load_bio.9 \ bus_dma.9 bus_dmamap_load_ccb.9 \ bus_dma.9 bus_dmamap_load_mbuf.9 \ bus_dma.9 bus_dmamap_load_mbuf_sg.9 \ bus_dma.9 bus_dmamap_load_uio.9 \ bus_dma.9 bus_dmamap_sync.9 \ bus_dma.9 bus_dmamap_unload.9 \ bus_dma.9 bus_dmamem_alloc.9 \ bus_dma.9 bus_dmamem_free.9 \ bus_dma.9 bus_dma_tag_create.9 \ bus_dma.9 bus_dma_tag_destroy.9 MLINKS+=bus_generic_read_ivar.9 bus_generic_write_ivar.9 MLINKS+=BUS_GET_CPUS.9 bus_get_cpus.9 MLINKS+=bus_map_resource.9 bus_unmap_resource.9 \ bus_map_resource.9 resource_init_map_request.9 MLINKS+=BUS_READ_IVAR.9 BUS_WRITE_IVAR.9 MLINKS+=BUS_SETUP_INTR.9 bus_setup_intr.9 \ BUS_SETUP_INTR.9 BUS_TEARDOWN_INTR.9 \ BUS_SETUP_INTR.9 bus_teardown_intr.9 MLINKS+=bus_space.9 bus_space_alloc.9 \ bus_space.9 bus_space_barrier.9 \ bus_space.9 bus_space_copy_region_1.9 \ bus_space.9 bus_space_copy_region_2.9 \ bus_space.9 bus_space_copy_region_4.9 \ bus_space.9 bus_space_copy_region_8.9 \ bus_space.9 bus_space_copy_region_stream_1.9 \ bus_space.9 
bus_space_copy_region_stream_2.9 \ bus_space.9 bus_space_copy_region_stream_4.9 \ bus_space.9 bus_space_copy_region_stream_8.9 \ bus_space.9 bus_space_free.9 \ bus_space.9 bus_space_map.9 \ bus_space.9 bus_space_read_1.9 \ bus_space.9 bus_space_read_2.9 \ bus_space.9 bus_space_read_4.9 \ bus_space.9 bus_space_read_8.9 \ bus_space.9 bus_space_read_multi_1.9 \ bus_space.9 bus_space_read_multi_2.9 \ bus_space.9 bus_space_read_multi_4.9 \ bus_space.9 bus_space_read_multi_8.9 \ bus_space.9 bus_space_read_multi_stream_1.9 \ bus_space.9 bus_space_read_multi_stream_2.9 \ bus_space.9 bus_space_read_multi_stream_4.9 \ bus_space.9 bus_space_read_multi_stream_8.9 \ bus_space.9 bus_space_read_region_1.9 \ bus_space.9 bus_space_read_region_2.9 \ bus_space.9 bus_space_read_region_4.9 \ bus_space.9 bus_space_read_region_8.9 \ bus_space.9 bus_space_read_region_stream_1.9 \ bus_space.9 bus_space_read_region_stream_2.9 \ bus_space.9 bus_space_read_region_stream_4.9 \ bus_space.9 bus_space_read_region_stream_8.9 \ bus_space.9 bus_space_read_stream_1.9 \ bus_space.9 bus_space_read_stream_2.9 \ bus_space.9 bus_space_read_stream_4.9 \ bus_space.9 bus_space_read_stream_8.9 \ bus_space.9 bus_space_set_multi_1.9 \ bus_space.9 bus_space_set_multi_2.9 \ bus_space.9 bus_space_set_multi_4.9 \ bus_space.9 bus_space_set_multi_8.9 \ bus_space.9 bus_space_set_multi_stream_1.9 \ bus_space.9 bus_space_set_multi_stream_2.9 \ bus_space.9 bus_space_set_multi_stream_4.9 \ bus_space.9 bus_space_set_multi_stream_8.9 \ bus_space.9 bus_space_set_region_1.9 \ bus_space.9 bus_space_set_region_2.9 \ bus_space.9 bus_space_set_region_4.9 \ bus_space.9 bus_space_set_region_8.9 \ bus_space.9 bus_space_set_region_stream_1.9 \ bus_space.9 bus_space_set_region_stream_2.9 \ bus_space.9 bus_space_set_region_stream_4.9 \ bus_space.9 bus_space_set_region_stream_8.9 \ bus_space.9 bus_space_subregion.9 \ bus_space.9 bus_space_unmap.9 \ bus_space.9 bus_space_write_1.9 \ bus_space.9 bus_space_write_2.9 \ bus_space.9 bus_space_write_4.9 \ bus_space.9 bus_space_write_8.9 \ bus_space.9 bus_space_write_multi_1.9 \ bus_space.9 bus_space_write_multi_2.9 \ bus_space.9 bus_space_write_multi_4.9 \ bus_space.9 bus_space_write_multi_8.9 \ bus_space.9 bus_space_write_multi_stream_1.9 \ bus_space.9 bus_space_write_multi_stream_2.9 \ bus_space.9 bus_space_write_multi_stream_4.9 \ bus_space.9 bus_space_write_multi_stream_8.9 \ bus_space.9 bus_space_write_region_1.9 \ bus_space.9 bus_space_write_region_2.9 \ bus_space.9 bus_space_write_region_4.9 \ bus_space.9 bus_space_write_region_8.9 \ bus_space.9 bus_space_write_region_stream_1.9 \ bus_space.9 bus_space_write_region_stream_2.9 \ bus_space.9 bus_space_write_region_stream_4.9 \ bus_space.9 bus_space_write_region_stream_8.9 \ bus_space.9 bus_space_write_stream_1.9 \ bus_space.9 bus_space_write_stream_2.9 \ bus_space.9 bus_space_write_stream_4.9 \ bus_space.9 bus_space_write_stream_8.9 MLINKS+=byteorder.9 be16dec.9 \ byteorder.9 be16enc.9 \ byteorder.9 be16toh.9 \ byteorder.9 be32dec.9 \ byteorder.9 be32enc.9 \ byteorder.9 be32toh.9 \ byteorder.9 be64dec.9 \ byteorder.9 be64enc.9 \ byteorder.9 be64toh.9 \ byteorder.9 bswap16.9 \ byteorder.9 bswap32.9 \ byteorder.9 bswap64.9 \ byteorder.9 htobe16.9 \ byteorder.9 htobe32.9 \ byteorder.9 htobe64.9 \ byteorder.9 htole16.9 \ byteorder.9 htole32.9 \ byteorder.9 htole64.9 \ byteorder.9 le16dec.9 \ byteorder.9 le16enc.9 \ byteorder.9 le16toh.9 \ byteorder.9 le32dec.9 \ byteorder.9 le32enc.9 \ byteorder.9 le32toh.9 \ byteorder.9 le64dec.9 \ byteorder.9 le64enc.9 \ 
byteorder.9 le64toh.9 MLINKS+=cnv.9 cnvlist.9 \ cnv.9 cnvlist_free_binary.9 \ cnv.9 cnvlist_free_bool.9 \ cnv.9 cnvlist_free_bool_array.9 \ cnv.9 cnvlist_free_descriptor.9 \ cnv.9 cnvlist_free_descriptor_array.9 \ cnv.9 cnvlist_free_null.9 \ cnv.9 cnvlist_free_number.9 \ cnv.9 cnvlist_free_number_array.9 \ cnv.9 cnvlist_free_nvlist.9 \ cnv.9 cnvlist_free_nvlist_array.9 \ cnv.9 cnvlist_free_string.9 \ cnv.9 cnvlist_free_string_array.9 \ cnv.9 cnvlist_get_binary.9 \ cnv.9 cnvlist_get_bool.9 \ cnv.9 cnvlist_get_bool_array.9 \ cnv.9 cnvlist_get_descriptor.9 \ cnv.9 cnvlist_get_descriptor_array.9 \ cnv.9 cnvlist_get_number.9 \ cnv.9 cnvlist_get_number_array.9 \ cnv.9 cnvlist_get_nvlist.9 \ cnv.9 cnvlist_get_nvlist_array.9 \ cnv.9 cnvlist_get_string.9 \ cnv.9 cnvlist_get_string_array.9 \ cnv.9 cnvlist_take_binary.9 \ cnv.9 cnvlist_take_bool.9 \ cnv.9 cnvlist_take_bool_array.9 \ cnv.9 cnvlist_take_descriptor.9 \ cnv.9 cnvlist_take_descriptor_array.9 \ cnv.9 cnvlist_take_number.9 \ cnv.9 cnvlist_take_number_array.9 \ cnv.9 cnvlist_take_nvlist.9 \ cnv.9 cnvlist_take_nvlist_array.9 \ cnv.9 cnvlist_take_string.9 \ cnv.9 cnvlist_take_string_array.9 MLINKS+=condvar.9 cv_broadcast.9 \ condvar.9 cv_broadcastpri.9 \ condvar.9 cv_destroy.9 \ condvar.9 cv_init.9 \ condvar.9 cv_signal.9 \ condvar.9 cv_timedwait.9 \ condvar.9 cv_timedwait_sig.9 \ condvar.9 cv_timedwait_sig_sbt.9 \ condvar.9 cv_wait.9 \ condvar.9 cv_wait_sig.9 \ condvar.9 cv_wait_unlock.9 \ condvar.9 cv_wmesg.9 MLINKS+=config_intrhook.9 config_intrhook_disestablish.9 \ config_intrhook.9 config_intrhook_establish.9 \ config_intrhook.9 config_intrhook_oneshot.9 MLINKS+=contigmalloc.9 contigmalloc_domainset.9 \ contigmalloc.9 contigfree.9 MLINKS+=casuword.9 casueword.9 \ casuword.9 casueword32.9 \ casuword.9 casuword32.9 MLINKS+=copy.9 copyin.9 \ copy.9 copyin_nofault.9 \ copy.9 copyinstr.9 \ copy.9 copyout.9 \ copy.9 copyout_nofault.9 \ copy.9 copystr.9 MLINKS+=counter.9 counter_u64_alloc.9 \ counter.9 counter_u64_free.9 \ counter.9 counter_u64_add.9 \ counter.9 counter_enter.9 \ counter.9 counter_exit.9 \ counter.9 counter_u64_add_protected.9 \ counter.9 counter_u64_fetch.9 \ counter.9 counter_u64_zero.9 \ counter.9 SYSCTL_COUNTER_U64.9 \ counter.9 SYSCTL_ADD_COUNTER_U64.9 \ counter.9 SYSCTL_COUNTER_U64_ARRAY.9 \ counter.9 SYSCTL_ADD_COUNTER_U64_ARRAY.9 MLINKS+=cpuset.9 CPUSET_T_INITIALIZER.9 \ cpuset.9 CPUSET_FSET.9 \ cpuset.9 CPU_CLR.9 \ cpuset.9 CPU_COPY.9 \ cpuset.9 CPU_ISSET.9 \ cpuset.9 CPU_SET.9 \ cpuset.9 CPU_ZERO.9 \ cpuset.9 CPU_FILL.9 \ cpuset.9 CPU_SETOF.9 \ cpuset.9 CPU_EMPTY.9 \ cpuset.9 CPU_ISFULLSET.9 \ cpuset.9 CPU_FFS.9 \ cpuset.9 CPU_COUNT.9 \ cpuset.9 CPU_SUBSET.9 \ cpuset.9 CPU_OVERLAP.9 \ cpuset.9 CPU_CMP.9 \ cpuset.9 CPU_OR.9 \ cpuset.9 CPU_AND.9 \ cpuset.9 CPU_NAND.9 \ cpuset.9 CPU_CLR_ATOMIC.9 \ cpuset.9 CPU_SET_ATOMIC.9 \ cpuset.9 CPU_SET_ATOMIC_ACQ.9 \ cpuset.9 CPU_AND_ATOMIC.9 \ cpuset.9 CPU_OR_ATOMIC.9 \ cpuset.9 CPU_COPY_STORE_REL.9 MLINKS+=critical_enter.9 critical.9 \ critical_enter.9 critical_exit.9 MLINKS+=crypto.9 crypto_dispatch.9 \ crypto.9 crypto_done.9 \ crypto.9 crypto_freereq.9 \ crypto.9 crypto_freesession.9 \ crypto.9 crypto_get_driverid.9 \ crypto.9 crypto_getreq.9 \ crypto.9 crypto_kdispatch.9 \ crypto.9 crypto_kdone.9 \ crypto.9 crypto_kregister.9 \ crypto.9 crypto_newsession.9 \ crypto.9 crypto_register.9 \ crypto.9 crypto_unblock.9 \ crypto.9 crypto_unregister.9 \ crypto.9 crypto_unregister_all.9 MLINKS+=DB_COMMAND.9 DB_SHOW_ALL_COMMAND.9 \ DB_COMMAND.9 DB_SHOW_COMMAND.9 MLINKS+=DECLARE_MODULE.9 
DECLARE_MODULE_TIED.9 MLINKS+=dev_clone.9 drain_dev_clone_events.9 MLINKS+=dev_refthread.9 devvn_refthread.9 \ dev_refthread.9 dev_relthread.9 MLINKS+=devfs_set_cdevpriv.9 devfs_clear_cdevpriv.9 \ devfs_set_cdevpriv.9 devfs_get_cdevpriv.9 MLINKS+=device_add_child.9 device_add_child_ordered.9 MLINKS+=device_enable.9 device_disable.9 \ device_enable.9 device_is_enabled.9 MLINKS+=device_get_ivars.9 device_set_ivars.9 MLINKS+=device_get_name.9 device_get_nameunit.9 MLINKS+=device_get_state.9 device_busy.9 \ device_get_state.9 device_is_alive.9 \ device_get_state.9 device_is_attached.9 \ device_get_state.9 device_unbusy.9 MLINKS+=device_get_sysctl.9 device_get_sysctl_ctx.9 \ device_get_sysctl.9 device_get_sysctl_tree.9 MLINKS+=device_quiet.9 device_is_quiet.9 \ device_quiet.9 device_verbose.9 MLINKS+=device_set_desc.9 device_get_desc.9 \ device_set_desc.9 device_set_desc_copy.9 MLINKS+=device_set_flags.9 device_get_flags.9 MLINKS+=devstat.9 devicestat.9 \ devstat.9 devstat_add_entry.9 \ devstat.9 devstat_end_transaction.9 \ devstat.9 devstat_remove_entry.9 \ devstat.9 devstat_start_transaction.9 MLINKS+=disk.9 disk_add_alias.9 \ disk.9 disk_alloc.9 \ disk.9 disk_create.9 \ disk.9 disk_destroy.9 \ disk.9 disk_gone.9 \ disk.9 disk_resize.9 MLINKS+=dnv.9 dnvlist.9 \ dnv.9 dnvlist_get_binary.9 \ dnv.9 dnvlist_get_bool.9 \ dnv.9 dnvlist_get_descriptor.9 \ dnv.9 dnvlist_get_number.9 \ dnv.9 dnvlist_get_nvlist.9 \ dnv.9 dnvlist_get_string.9 \ dnv.9 dnvlist_take_binary.9 \ dnv.9 dnvlist_take_bool.9 \ dnv.9 dnvlist_take_descriptor.9 \ dnv.9 dnvlist_take_number.9 \ dnv.9 dnvlist_take_nvlist.9 \ dnv.9 dnvlist_take_string.9 MLINKS+=domain.9 DOMAIN_SET.9 \ domain.9 domain_add.9 \ domain.9 pfctlinput.9 \ domain.9 pfctlinput2.9 \ domain.9 pffinddomain.9 \ domain.9 pffindproto.9 \ domain.9 pffindtype.9 MLINKS+=drbr.9 drbr_free.9 \ drbr.9 drbr_enqueue.9 \ drbr.9 drbr_dequeue.9 \ drbr.9 drbr_dequeue_cond.9 \ drbr.9 drbr_flush.9 \ drbr.9 drbr_empty.9 \ drbr.9 drbr_inuse.9 \ drbr.9 drbr_stats_update.9 MLINKS+=DRIVER_MODULE.9 DRIVER_MODULE_ORDERED.9 \ DRIVER_MODULE.9 EARLY_DRIVER_MODULE.9 \ DRIVER_MODULE.9 EARLY_DRIVER_MODULE_ORDERED.9 MLINKS+=epoch.9 epoch_context.9 \ epoch.9 epoch_alloc.9 \ epoch.9 epoch_free.9 \ epoch.9 epoch_enter.9 \ epoch.9 epoch_exit.9 \ epoch.9 epoch_wait.9 \ epoch.9 epoch_call.9 \ epoch.9 epoch_drain_callbacks.9 \ epoch.9 in_epoch.9 MLINKS+=EVENTHANDLER.9 EVENTHANDLER_DECLARE.9 \ EVENTHANDLER.9 EVENTHANDLER_DEFINE.9 \ EVENTHANDLER.9 EVENTHANDLER_DEREGISTER.9 \ EVENTHANDLER.9 eventhandler_deregister.9 \ EVENTHANDLER.9 eventhandler_find_list.9 \ EVENTHANDLER.9 EVENTHANDLER_INVOKE.9 \ EVENTHANDLER.9 eventhandler_prune_list.9 \ EVENTHANDLER.9 EVENTHANDLER_REGISTER.9 \ EVENTHANDLER.9 eventhandler_register.9 MLINKS+=eventtimers.9 et_register.9 \ eventtimers.9 et_deregister.9 \ eventtimers.9 et_ban.9 \ eventtimers.9 et_find.9 \ eventtimers.9 et_free.9 \ eventtimers.9 et_init.9 \ eventtimers.9 ET_LOCK.9 \ eventtimers.9 ET_UNLOCK.9 \ eventtimers.9 et_start.9 \ eventtimers.9 et_stop.9 MLINKS+=fail.9 KFAIL_POINT_CODE.9 \ fail.9 KFAIL_POINT_ERROR.9 \ fail.9 KFAIL_POINT_GOTO.9 \ fail.9 KFAIL_POINT_RETURN.9 \ fail.9 KFAIL_POINT_RETURN_VOID.9 MLINKS+=fdt_pinctrl.9 fdt_pinctrl_configure.9 \ fdt_pinctrl.9 fdt_pinctrl_configure_by_name.9 \ fdt_pinctrl.9 fdt_pinctrl_configure_tree.9 \ fdt_pinctrl.9 fdt_pinctrl_register.9 MLINKS+=fetch.9 fubyte.9 \ fetch.9 fuword.9 \ fetch.9 fuword16.9 \ fetch.9 fuword32.9 \ fetch.9 fuword64.9 \ fetch.9 fueword.9 \ fetch.9 fueword32.9 \ fetch.9 fueword64.9 MLINKS+=firmware.9 
firmware_get.9 \ firmware.9 firmware_put.9 \ firmware.9 firmware_register.9 \ firmware.9 firmware_unregister.9 MLINKS+=fpu_kern.9 fpu_kern_alloc_ctx.9 \ fpu_kern.9 fpu_kern_free_ctx.9 \ fpu_kern.9 fpu_kern_enter.9 \ fpu_kern.9 fpu_kern_leave.9 \ fpu_kern.9 fpu_kern_thread.9 \ fpu_kern.9 is_fpu_kern_thread.9 MLINKS+=g_attach.9 g_detach.9 MLINKS+=g_bio.9 g_alloc_bio.9 \ g_bio.9 g_clone_bio.9 \ g_bio.9 g_destroy_bio.9 \ g_bio.9 g_duplicate_bio.9 \ + g_bio.9 g_format_bio.9 \ g_bio.9 g_new_bio.9 \ g_bio.9 g_print_bio.9 \ g_bio.9 g_reset_bio.9 MLINKS+=g_consumer.9 g_destroy_consumer.9 \ g_consumer.9 g_new_consumer.9 MLINKS+=g_data.9 g_read_data.9 \ g_data.9 g_write_data.9 MLINKS+=getenv.9 freeenv.9 \ getenv.9 getenv_int.9 \ getenv.9 getenv_long.9 \ getenv.9 getenv_string.9 \ getenv.9 getenv_quad.9 \ getenv.9 getenv_uint.9 \ getenv.9 getenv_ulong.9 \ getenv.9 kern_getenv.9 \ getenv.9 kern_setenv.9 \ getenv.9 kern_unsetenv.9 \ getenv.9 setenv.9 \ getenv.9 testenv.9 \ getenv.9 unsetenv.9 MLINKS+=g_event.9 g_cancel_event.9 \ g_event.9 g_post_event.9 \ g_event.9 g_waitfor_event.9 MLINKS+=g_geom.9 g_destroy_geom.9 \ g_geom.9 g_new_geomf.9 MLINKS+=g_provider.9 g_destroy_provider.9 \ g_provider.9 g_error_provider.9 \ g_provider.9 g_new_providerf.9 MLINKS+=hash.9 hash32.9 \ hash.9 hash32_buf.9 \ hash.9 hash32_str.9 \ hash.9 hash32_stre.9 \ hash.9 hash32_strn.9 \ hash.9 hash32_strne.9 \ hash.9 jenkins_hash.9 \ hash.9 jenkins_hash32.9 MLINKS+=hashinit.9 hashdestroy.9 \ hashinit.9 hashinit_flags.9 \ hashinit.9 phashinit.9 MLINKS+=hhook.9 hhook_head_register.9 \ hhook.9 hhook_head_deregister.9 \ hhook.9 hhook_head_deregister_lookup.9 \ hhook.9 hhook_run_hooks.9 \ hhook.9 HHOOKS_RUN_IF.9 \ hhook.9 HHOOKS_RUN_LOOKUP_IF.9 MLINKS+=ieee80211.9 ieee80211_ifattach.9 \ ieee80211.9 ieee80211_ifdetach.9 MLINKS+=ieee80211_amrr.9 ieee80211_amrr_choose.9 \ ieee80211_amrr.9 ieee80211_amrr_cleanup.9 \ ieee80211_amrr.9 ieee80211_amrr_init.9 \ ieee80211_amrr.9 ieee80211_amrr_node_init.9 \ ieee80211_amrr.9 ieee80211_amrr_setinterval.9 \ ieee80211_amrr.9 ieee80211_amrr_tx_complete.9 \ ieee80211_amrr.9 ieee80211_amrr_tx_update.9 MLINKS+=ieee80211_beacon.9 ieee80211_beacon_alloc.9 \ ieee80211_beacon.9 ieee80211_beacon_notify.9 \ ieee80211_beacon.9 ieee80211_beacon_update.9 MLINKS+=ieee80211_bmiss.9 ieee80211_beacon_miss.9 MLINKS+=ieee80211_crypto.9 ieee80211_crypto_available.9 \ ieee80211_crypto.9 ieee80211_crypto_decap.9 \ ieee80211_crypto.9 ieee80211_crypto_delglobalkeys.9 \ ieee80211_crypto.9 ieee80211_crypto_delkey.9 \ ieee80211_crypto.9 ieee80211_crypto_demic.9 \ ieee80211_crypto.9 ieee80211_crypto_encap.9 \ ieee80211_crypto.9 ieee80211_crypto_enmic.9 \ ieee80211_crypto.9 ieee80211_crypto_newkey.9 \ ieee80211_crypto.9 ieee80211_crypto_register.9 \ ieee80211_crypto.9 ieee80211_crypto_reload_keys.9 \ ieee80211_crypto.9 ieee80211_crypto_setkey.9 \ ieee80211_crypto.9 ieee80211_crypto_unregister.9 \ ieee80211_crypto.9 ieee80211_key_update_begin.9 \ ieee80211_crypto.9 ieee80211_key_update_end.9 \ ieee80211_crypto.9 ieee80211_notify_michael_failure.9 \ ieee80211_crypto.9 ieee80211_notify_replay_failure.9 MLINKS+=ieee80211_input.9 ieee80211_input_all.9 MLINKS+=ieee80211_node.9 ieee80211_dump_node.9 \ ieee80211_node.9 ieee80211_dump_nodes.9 \ ieee80211_node.9 ieee80211_find_rxnode.9 \ ieee80211_node.9 ieee80211_find_rxnode_withkey.9 \ ieee80211_node.9 ieee80211_free_node.9 \ ieee80211_node.9 ieee80211_iterate_nodes.9 \ ieee80211_node.9 ieee80211_ref_node.9 \ ieee80211_node.9 ieee80211_unref_node.9 MLINKS+=ieee80211_output.9 
ieee80211_process_callback.9 \ ieee80211_output.9 M_SEQNO_GET.9 \ ieee80211_output.9 M_WME_GETAC.9 MLINKS+=ieee80211_proto.9 ieee80211_new_state.9 \ ieee80211_proto.9 ieee80211_resume_all.9 \ ieee80211_proto.9 ieee80211_start_all.9 \ ieee80211_proto.9 ieee80211_stop_all.9 \ ieee80211_proto.9 ieee80211_suspend_all.9 \ ieee80211_proto.9 ieee80211_waitfor_parent.9 MLINKS+=ieee80211_radiotap.9 ieee80211_radiotap_active.9 \ ieee80211_radiotap.9 ieee80211_radiotap_active_vap.9 \ ieee80211_radiotap.9 ieee80211_radiotap_attach.9 \ ieee80211_radiotap.9 ieee80211_radiotap_tx.9 \ ieee80211_radiotap.9 radiotap.9 MLINKS+=ieee80211_regdomain.9 ieee80211_alloc_countryie.9 \ ieee80211_regdomain.9 ieee80211_init_channels.9 \ ieee80211_regdomain.9 ieee80211_sort_channels.9 MLINKS+=ieee80211_scan.9 ieee80211_add_scan.9 \ ieee80211_scan.9 ieee80211_bg_scan.9 \ ieee80211_scan.9 ieee80211_cancel_scan.9 \ ieee80211_scan.9 ieee80211_cancel_scan_any.9 \ ieee80211_scan.9 ieee80211_check_scan.9 \ ieee80211_scan.9 ieee80211_check_scan_current.9 \ ieee80211_scan.9 ieee80211_flush.9 \ ieee80211_scan.9 ieee80211_probe_curchan.9 \ ieee80211_scan.9 ieee80211_scan_assoc_fail.9 \ ieee80211_scan.9 ieee80211_scan_done.9 \ ieee80211_scan.9 ieee80211_scan_dump_channels.9 \ ieee80211_scan.9 ieee80211_scan_flush.9 \ ieee80211_scan.9 ieee80211_scan_iterate.9 \ ieee80211_scan.9 ieee80211_scan_next.9 \ ieee80211_scan.9 ieee80211_scan_timeout.9 \ ieee80211_scan.9 ieee80211_scanner_get.9 \ ieee80211_scan.9 ieee80211_scanner_register.9 \ ieee80211_scan.9 ieee80211_scanner_unregister.9 \ ieee80211_scan.9 ieee80211_scanner_unregister_all.9 \ ieee80211_scan.9 ieee80211_start_scan.9 MLINKS+=ieee80211_vap.9 ieee80211_vap_attach.9 \ ieee80211_vap.9 ieee80211_vap_detach.9 \ ieee80211_vap.9 ieee80211_vap_setup.9 MLINKS+=iflibdd.9 ifdi_attach_pre.9 \ iflibdd.9 ifdi_attach_post.9 \ iflibdd.9 ifdi_detach.9 \ iflibdd.9 ifdi_get_counter.9 \ iflibdd.9 ifdi_i2c_req.9 \ iflibdd.9 ifdi_init.9 \ iflibdd.9 ifdi_intr_enable.9 \ iflibdd.9 ifdi_intr_disable.9 \ iflibdd.9 ifdi_led_func.9 \ iflibdd.9 ifdi_link_intr_enable.9 \ iflibdd.9 ifdi_media_set.9 \ iflibdd.9 ifdi_media_status.9 \ iflibdd.9 ifdi_media_change.9 \ iflibdd.9 ifdi_mtu_set.9 \ iflibdd.9 ifdi_multi_set.9 \ iflibdd.9 ifdi_promisc_set.9 \ iflibdd.9 ifdi_queues_alloc.9 \ iflibdd.9 ifdi_queues_free.9 \ iflibdd.9 ifdi_queue_intr_enable.9 \ iflibdd.9 ifdi_resume.9 \ iflibdd.9 ifdi_rxq_setup.9 \ iflibdd.9 ifdi_stop.9 \ iflibdd.9 ifdi_suspend.9 \ iflibdd.9 ifdi_sysctl_int_delay.9 \ iflibdd.9 ifdi_timer.9 \ iflibdd.9 ifdi_txq_setup.9 \ iflibdd.9 ifdi_update_admin_status.9 \ iflibdd.9 ifdi_vf_add.9 \ iflibdd.9 ifdi_vflr_handle.9 \ iflibdd.9 ifdi_vlan_register.9 \ iflibdd.9 ifdi_vlan_unregister.9 \ iflibdd.9 ifdi_watchdog_reset.9 \ iflibdd.9 iov_init.9 \ iflibdd.9 iov_uinit.9 MLINKS+=iflibdi.9 iflib_add_int_delay_sysctl.9 \ iflibdi.9 iflib_device_attach.9 \ iflibdi.9 iflib_device_deregister.9 \ iflibdi.9 iflib_device_detach.9 \ iflibdi.9 iflib_device_suspend.9 \ iflibdi.9 iflib_device_register.9 \ iflibdi.9 iflib_device_resume.9 \ iflibdi.9 iflib_led_create.9 \ iflibdi.9 iflib_irq_alloc.9 \ iflibdi.9 iflib_irq_alloc_generic.9 \ iflibdi.9 iflib_link_intr_deferred.9 \ iflibdi.9 iflib_link_state_change.9 \ iflibdi.9 iflib_rx_intr_deferred.9 \ iflibdi.9 iflib_tx_intr_deferred.9 MLINKS+=iflibtxrx.9 isc_rxd_available.9 \ iflibtxrx.9 isc_rxd_refill.9 \ iflibtxrx.9 isc_rxd_flush.9 \ iflibtxrx.9 isc_rxd_pkt_get.9 \ iflibtxrx.9 isc_txd_credits_update.9 \ iflibtxrx.9 isc_txd_encap.9 \ iflibtxrx.9 isc_txd_flush.9 
MLINKS+=ifnet.9 if_addmulti.9 \ ifnet.9 if_alloc.9 \ ifnet.9 if_alloc_dev.9 \ ifnet.9 if_alloc_domain.9 \ ifnet.9 if_allmulti.9 \ ifnet.9 if_attach.9 \ ifnet.9 if_data.9 \ ifnet.9 IF_DEQUEUE.9 \ ifnet.9 if_delmulti.9 \ ifnet.9 if_detach.9 \ ifnet.9 if_down.9 \ ifnet.9 if_findmulti.9 \ ifnet.9 if_free.9 \ ifnet.9 if_free_type.9 \ ifnet.9 if_up.9 \ ifnet.9 ifa_free.9 \ ifnet.9 ifa_ifwithaddr.9 \ ifnet.9 ifa_ifwithdstaddr.9 \ ifnet.9 ifa_ifwithnet.9 \ ifnet.9 ifa_ref.9 \ ifnet.9 ifaddr.9 \ ifnet.9 ifaddr_byindex.9 \ ifnet.9 ifaof_ifpforaddr.9 \ ifnet.9 ifioctl.9 \ ifnet.9 ifpromisc.9 \ ifnet.9 ifqueue.9 \ ifnet.9 ifunit.9 \ ifnet.9 ifunit_ref.9 MLINKS+=insmntque.9 insmntque1.9 MLINKS+=ithread.9 ithread_add_handler.9 \ ithread.9 ithread_create.9 \ ithread.9 ithread_destroy.9 \ ithread.9 ithread_priority.9 \ ithread.9 ithread_remove_handler.9 \ ithread.9 ithread_schedule.9 MLINKS+=kernacc.9 useracc.9 MLINKS+=kernel_mount.9 free_mntarg.9 \ kernel_mount.9 kernel_vmount.9 \ kernel_mount.9 mount_arg.9 \ kernel_mount.9 mount_argb.9 \ kernel_mount.9 mount_argf.9 \ kernel_mount.9 mount_argsu.9 MLINKS+=khelp.9 khelp_add_hhook.9 \ khelp.9 KHELP_DECLARE_MOD.9 \ khelp.9 KHELP_DECLARE_MOD_UMA.9 \ khelp.9 khelp_destroy_osd.9 \ khelp.9 khelp_get_id.9 \ khelp.9 khelp_get_osd.9 \ khelp.9 khelp_init_osd.9 \ khelp.9 khelp_remove_hhook.9 MLINKS+=kobj.9 DEFINE_CLASS.9 \ kobj.9 kobj_class_compile.9 \ kobj.9 kobj_class_compile_static.9 \ kobj.9 kobj_class_free.9 \ kobj.9 kobj_create.9 \ kobj.9 kobj_delete.9 \ kobj.9 kobj_init.9 \ kobj.9 kobj_init_static.9 MLINKS+=kproc.9 kproc_create.9 \ kproc.9 kproc_exit.9 \ kproc.9 kproc_kthread_add.9 \ kproc.9 kproc_resume.9 \ kproc.9 kproc_shutdown.9 \ kproc.9 kproc_start.9 \ kproc.9 kproc_suspend.9 \ kproc.9 kproc_suspend_check.9 \ kproc.9 kthread_create.9 MLINKS+=kqueue.9 knlist_add.9 \ kqueue.9 knlist_clear.9 \ kqueue.9 knlist_delete.9 \ kqueue.9 knlist_destroy.9 \ kqueue.9 knlist_empty.9 \ kqueue.9 knlist_init.9 \ kqueue.9 knlist_init_mtx.9 \ kqueue.9 knlist_init_rw_reader.9 \ kqueue.9 knlist_remove.9 \ kqueue.9 knlist_remove_inevent.9 \ kqueue.9 knote_fdclose.9 \ kqueue.9 KNOTE_LOCKED.9 \ kqueue.9 KNOTE_UNLOCKED.9 \ kqueue.9 kqfd_register.9 \ kqueue.9 kqueue_add_filteropts.9 \ kqueue.9 kqueue_del_filteropts.9 MLINKS+=kthread.9 kthread_add.9 \ kthread.9 kthread_exit.9 \ kthread.9 kthread_resume.9 \ kthread.9 kthread_shutdown.9 \ kthread.9 kthread_start.9 \ kthread.9 kthread_suspend.9 \ kthread.9 kthread_suspend_check.9 MLINKS+=ktr.9 CTR0.9 \ ktr.9 CTR1.9 \ ktr.9 CTR2.9 \ ktr.9 CTR3.9 \ ktr.9 CTR4.9 \ ktr.9 CTR5.9 \ ktr.9 CTR6.9 MLINKS+=lock.9 lockdestroy.9 \ lock.9 lockinit.9 \ lock.9 lockmgr.9 \ lock.9 lockmgr_args.9 \ lock.9 lockmgr_args_rw.9 \ lock.9 lockmgr_assert.9 \ lock.9 lockmgr_disown.9 \ lock.9 lockmgr_printinfo.9 \ lock.9 lockmgr_recursed.9 \ lock.9 lockmgr_rw.9 \ lock.9 lockstatus.9 MLINKS+=LOCK_PROFILING.9 MUTEX_PROFILING.9 MLINKS+=make_dev.9 destroy_dev.9 \ make_dev.9 destroy_dev_drain.9 \ make_dev.9 destroy_dev_sched.9 \ make_dev.9 destroy_dev_sched_cb.9 \ make_dev.9 dev_depends.9 \ make_dev.9 make_dev_alias.9 \ make_dev.9 make_dev_alias_p.9 \ make_dev.9 make_dev_cred.9 \ make_dev.9 make_dev_credf.9 \ make_dev.9 make_dev_p.9 \ make_dev.9 make_dev_s.9 MLINKS+=malloc.9 free.9 \ malloc.9 malloc_domainset.9 \ malloc.9 free_domain.9 \ malloc.9 mallocarray.9 \ malloc.9 MALLOC_DECLARE.9 \ malloc.9 MALLOC_DEFINE.9 \ malloc.9 realloc.9 \ malloc.9 reallocf.9 MLINKS+=mbchain.9 mb_detach.9 \ mbchain.9 mb_done.9 \ mbchain.9 mb_fixhdr.9 \ mbchain.9 mb_init.9 \ mbchain.9 
mb_initm.9 \ mbchain.9 mb_put_int64be.9 \ mbchain.9 mb_put_int64le.9 \ mbchain.9 mb_put_mbuf.9 \ mbchain.9 mb_put_mem.9 \ mbchain.9 mb_put_uint16be.9 \ mbchain.9 mb_put_uint16le.9 \ mbchain.9 mb_put_uint32be.9 \ mbchain.9 mb_put_uint32le.9 \ mbchain.9 mb_put_uint8.9 \ mbchain.9 mb_put_uio.9 \ mbchain.9 mb_reserve.9 MLINKS+=\ mbuf.9 m_adj.9 \ mbuf.9 m_align.9 \ mbuf.9 M_ALIGN.9 \ mbuf.9 m_append.9 \ mbuf.9 m_apply.9 \ mbuf.9 m_cat.9 \ mbuf.9 m_catpkt.9 \ mbuf.9 MCHTYPE.9 \ mbuf.9 MCLGET.9 \ mbuf.9 m_collapse.9 \ mbuf.9 m_copyback.9 \ mbuf.9 m_copydata.9 \ mbuf.9 m_copym.9 \ mbuf.9 m_copypacket.9 \ mbuf.9 m_copyup.9 \ mbuf.9 m_defrag.9 \ mbuf.9 m_devget.9 \ mbuf.9 m_dup.9 \ mbuf.9 m_dup_pkthdr.9 \ mbuf.9 MEXTADD.9 \ mbuf.9 m_fixhdr.9 \ mbuf.9 m_free.9 \ mbuf.9 m_freem.9 \ mbuf.9 MGET.9 \ mbuf.9 m_get.9 \ mbuf.9 m_get2.9 \ mbuf.9 m_getjcl.9 \ mbuf.9 m_getcl.9 \ mbuf.9 MGETHDR.9 \ mbuf.9 m_gethdr.9 \ mbuf.9 m_getm.9 \ mbuf.9 m_getptr.9 \ mbuf.9 MH_ALIGN.9 \ mbuf.9 M_LEADINGSPACE.9 \ mbuf.9 m_length.9 \ mbuf.9 M_MOVE_PKTHDR.9 \ mbuf.9 m_move_pkthdr.9 \ mbuf.9 M_PREPEND.9 \ mbuf.9 m_prepend.9 \ mbuf.9 m_pulldown.9 \ mbuf.9 m_pullup.9 \ mbuf.9 m_split.9 \ mbuf.9 mtod.9 \ mbuf.9 M_TRAILINGSPACE.9 \ mbuf.9 m_unshare.9 \ mbuf.9 M_WRITABLE.9 MLINKS+=\ mbuf_tags.9 m_tag_alloc.9 \ mbuf_tags.9 m_tag_copy.9 \ mbuf_tags.9 m_tag_copy_chain.9 \ mbuf_tags.9 m_tag_delete.9 \ mbuf_tags.9 m_tag_delete_chain.9 \ mbuf_tags.9 m_tag_delete_nonpersistent.9 \ mbuf_tags.9 m_tag_find.9 \ mbuf_tags.9 m_tag_first.9 \ mbuf_tags.9 m_tag_free.9 \ mbuf_tags.9 m_tag_get.9 \ mbuf_tags.9 m_tag_init.9 \ mbuf_tags.9 m_tag_locate.9 \ mbuf_tags.9 m_tag_next.9 \ mbuf_tags.9 m_tag_prepend.9 \ mbuf_tags.9 m_tag_unlink.9 MLINKS+=MD5.9 MD5Init.9 \ MD5.9 MD5Transform.9 MLINKS+=mdchain.9 md_append_record.9 \ mdchain.9 md_done.9 \ mdchain.9 md_get_int64.9 \ mdchain.9 md_get_int64be.9 \ mdchain.9 md_get_int64le.9 \ mdchain.9 md_get_mbuf.9 \ mdchain.9 md_get_mem.9 \ mdchain.9 md_get_uint16.9 \ mdchain.9 md_get_uint16be.9 \ mdchain.9 md_get_uint16le.9 \ mdchain.9 md_get_uint32.9 \ mdchain.9 md_get_uint32be.9 \ mdchain.9 md_get_uint32le.9 \ mdchain.9 md_get_uint8.9 \ mdchain.9 md_get_uio.9 \ mdchain.9 md_initm.9 \ mdchain.9 md_next_record.9 MLINKS+=microtime.9 bintime.9 \ microtime.9 getbintime.9 \ microtime.9 getmicrotime.9 \ microtime.9 getnanotime.9 \ microtime.9 nanotime.9 MLINKS+=microuptime.9 binuptime.9 \ microuptime.9 getbinuptime.9 \ microuptime.9 getmicrouptime.9 \ microuptime.9 getnanouptime.9 \ microuptime.9 getsbinuptime.9 \ microuptime.9 nanouptime.9 \ microuptime.9 sbinuptime.9 MLINKS+=mi_switch.9 cpu_switch.9 \ mi_switch.9 cpu_throw.9 MLINKS+=mod_cc.9 CCV.9 \ mod_cc.9 DECLARE_CC_MODULE.9 MLINKS+=mtx_pool.9 mtx_pool_alloc.9 \ mtx_pool.9 mtx_pool_create.9 \ mtx_pool.9 mtx_pool_destroy.9 \ mtx_pool.9 mtx_pool_find.9 \ mtx_pool.9 mtx_pool_lock.9 \ mtx_pool.9 mtx_pool_lock_spin.9 \ mtx_pool.9 mtx_pool_unlock.9 \ mtx_pool.9 mtx_pool_unlock_spin.9 MLINKS+=mutex.9 mtx_assert.9 \ mutex.9 mtx_destroy.9 \ mutex.9 mtx_init.9 \ mutex.9 mtx_initialized.9 \ mutex.9 mtx_lock.9 \ mutex.9 mtx_lock_flags.9 \ mutex.9 mtx_lock_spin.9 \ mutex.9 mtx_lock_spin_flags.9 \ mutex.9 mtx_owned.9 \ mutex.9 mtx_recursed.9 \ mutex.9 mtx_sleep.9 \ mutex.9 MTX_SYSINIT.9 \ mutex.9 mtx_trylock.9 \ mutex.9 mtx_trylock_flags.9 \ mutex.9 mtx_trylock_spin.9 \ mutex.9 mtx_trylock_spin_flags.9 \ mutex.9 mtx_unlock.9 \ mutex.9 mtx_unlock_flags.9 \ mutex.9 mtx_unlock_spin.9 \ mutex.9 mtx_unlock_spin_flags.9 MLINKS+=namei.9 NDFREE.9 \ namei.9 NDINIT.9 MLINKS+=netisr.9 
netisr_clearqdrops.9 \ netisr.9 netisr_default_flow2cpu.9 \ netisr.9 netisr_dispatch.9 \ netisr.9 netisr_dispatch_src.9 \ netisr.9 netisr_get_cpucount.9 \ netisr.9 netisr_get_cpuid.9 \ netisr.9 netisr_getqdrops.9 \ netisr.9 netisr_getqlimit.9 \ netisr.9 netisr_queue.9 \ netisr.9 netisr_queue_src.9 \ netisr.9 netisr_register.9 \ netisr.9 netisr_setqlimit.9 \ netisr.9 netisr_unregister.9 MLINKS+=nv.9 libnv.9 \ nv.9 nvlist.9 \ nv.9 nvlist_add_binary.9 \ nv.9 nvlist_add_bool.9 \ nv.9 nvlist_add_bool_array.9 \ nv.9 nvlist_add_descriptor.9 \ nv.9 nvlist_add_descriptor_array.9 \ nv.9 nvlist_add_null.9 \ nv.9 nvlist_add_number.9 \ nv.9 nvlist_add_number_array.9 \ nv.9 nvlist_add_nvlist.9 \ nv.9 nvlist_add_nvlist_array.9 \ nv.9 nvlist_add_string.9 \ nv.9 nvlist_add_stringf.9 \ nv.9 nvlist_add_stringv.9 \ nv.9 nvlist_add_string_array.9 \ nv.9 nvlist_clone.9 \ nv.9 nvlist_create.9 \ nv.9 nvlist_destroy.9 \ nv.9 nvlist_dump.9 \ nv.9 nvlist_empty.9 \ nv.9 nvlist_error.9 \ nv.9 nvlist_exists.9 \ nv.9 nvlist_exists_binary.9 \ nv.9 nvlist_exists_bool.9 \ nv.9 nvlist_exists_bool_array.9 \ nv.9 nvlist_exists_descriptor.9 \ nv.9 nvlist_exists_descriptor_array.9 \ nv.9 nvlist_exists_null.9 \ nv.9 nvlist_exists_number.9 \ nv.9 nvlist_exists_number_array.9 \ nv.9 nvlist_exists_nvlist.9 \ nv.9 nvlist_exists_nvlist_array.9 \ nv.9 nvlist_exists_string.9 \ nv.9 nvlist_exists_type.9 \ nv.9 nvlist_fdump.9 \ nv.9 nvlist_flags.9 \ nv.9 nvlist_free.9 \ nv.9 nvlist_free_binary.9 \ nv.9 nvlist_free_bool.9 \ nv.9 nvlist_free_bool_array.9 \ nv.9 nvlist_free_descriptor.9 \ nv.9 nvlist_free_descriptor_array.9 \ nv.9 nvlist_free_null.9 \ nv.9 nvlist_free_number.9 \ nv.9 nvlist_free_number_array.9 \ nv.9 nvlist_free_nvlist.9 \ nv.9 nvlist_free_nvlist_array.9 \ nv.9 nvlist_free_string.9 \ nv.9 nvlist_free_string_array.9 \ nv.9 nvlist_free_type.9 \ nv.9 nvlist_get_binary.9 \ nv.9 nvlist_get_bool.9 \ nv.9 nvlist_get_bool_array.9 \ nv.9 nvlist_get_descriptor.9 \ nv.9 nvlist_get_descriptor_array.9 \ nv.9 nvlist_get_number.9 \ nv.9 nvlist_get_number_array.9 \ nv.9 nvlist_get_nvlist.9 \ nv.9 nvlist_get_nvlist_array.9 \ nv.9 nvlist_get_parent.9 \ nv.9 nvlist_get_string.9 \ nv.9 nvlist_get_string_array.9 \ nv.9 nvlist_move_binary.9 \ nv.9 nvlist_move_descriptor.9 \ nv.9 nvlist_move_descriptor_array.9 \ nv.9 nvlist_move_nvlist.9 \ nv.9 nvlist_move_nvlist_array.9 \ nv.9 nvlist_move_string.9 \ nv.9 nvlist_move_string_array.9 \ nv.9 nvlist_next.9 \ nv.9 nvlist_pack.9 \ nv.9 nvlist_recv.9 \ nv.9 nvlist_send.9 \ nv.9 nvlist_set_error.9 \ nv.9 nvlist_size.9 \ nv.9 nvlist_take_binary.9 \ nv.9 nvlist_take_bool.9 \ nv.9 nvlist_take_bool_array.9 \ nv.9 nvlist_take_descriptor.9 \ nv.9 nvlist_take_descriptor_array.9 \ nv.9 nvlist_take_number.9 \ nv.9 nvlist_take_number_array.9 \ nv.9 nvlist_take_nvlist.9 \ nv.9 nvlist_take_nvlist_array.9 \ nv.9 nvlist_take_string.9 \ nv.9 nvlist_take_string_array.9 \ nv.9 nvlist_unpack.9 \ nv.9 nvlist_xfer.9 MLINKS+=OF_child.9 OF_parent.9 \ OF_child.9 OF_peer.9 MLINKS+=OF_device_from_xref.9 OF_device_register_xref.9 \ OF_device_from_xref.9 OF_xref_from_device.9 MLINKS+=OF_getprop.9 OF_getencprop.9 \ OF_getprop.9 OF_getencprop_alloc.9 \ OF_getprop.9 OF_getencprop_alloc_multi.9 \ OF_getprop.9 OF_getprop_alloc.9 \ OF_getprop.9 OF_getprop_alloc_multi.9 \ OF_getprop.9 OF_getproplen.9 \ OF_getprop.9 OF_hasprop.9 \ OF_getprop.9 OF_nextprop.9 \ OF_getprop.9 OF_prop_free.9 \ OF_getprop.9 OF_searchencprop.9 \ OF_getprop.9 OF_searchprop.9 \ OF_getprop.9 OF_setprop.9 MLINKS+=OF_node_from_xref.9 OF_xref_from_node.9 
MLINKS+=ofw_bus_is_compatible.9 ofw_bus_is_compatible_strict.9 \ ofw_bus_is_compatible.9 ofw_bus_node_is_compatible.9 \ ofw_bus_is_compatible.9 ofw_bus_search_compatible.9 MLINKS+= ofw_bus_status_okay.9 ofw_bus_get_status.9 \ ofw_bus_status_okay.9 ofw_bus_node_status_okay.9 MLINKS+=osd.9 osd_call.9 \ osd.9 osd_del.9 \ osd.9 osd_deregister.9 \ osd.9 osd_exit.9 \ osd.9 osd_get.9 \ osd.9 osd_register.9 \ osd.9 osd_set.9 MLINKS+=panic.9 vpanic.9 MLINKS+=PCBGROUP.9 in_pcbgroup_byhash.9 \ PCBGROUP.9 in_pcbgroup_byinpcb.9 \ PCBGROUP.9 in_pcbgroup_destroy.9 \ PCBGROUP.9 in_pcbgroup_enabled.9 \ PCBGROUP.9 in_pcbgroup_init.9 \ PCBGROUP.9 in_pcbgroup_remove.9 \ PCBGROUP.9 in_pcbgroup_update.9 \ PCBGROUP.9 in_pcbgroup_update_mbuf.9 \ PCBGROUP.9 in6_pcbgroup_byhash.9 MLINKS+=pci.9 pci_alloc_msi.9 \ pci.9 pci_alloc_msix.9 \ pci.9 pci_disable_busmaster.9 \ pci.9 pci_disable_io.9 \ pci.9 pci_enable_busmaster.9 \ pci.9 pci_enable_io.9 \ pci.9 pci_find_bsf.9 \ pci.9 pci_find_cap.9 \ pci.9 pci_find_dbsf.9 \ pci.9 pci_find_device.9 \ pci.9 pci_find_extcap.9 \ pci.9 pci_find_htcap.9 \ pci.9 pci_find_pcie_root_port.9 \ pci.9 pci_get_id.9 \ pci.9 pci_get_max_read_req.9 \ pci.9 pci_get_powerstate.9 \ pci.9 pci_get_vpd_ident.9 \ pci.9 pci_get_vpd_readonly.9 \ pci.9 pci_iov_attach.9 \ pci.9 pci_iov_attach_name.9 \ pci.9 pci_iov_detach.9 \ pci.9 pci_msi_count.9 \ pci.9 pci_msix_count.9 \ pci.9 pci_msix_pba_bar.9 \ pci.9 pci_msix_table_bar.9 \ pci.9 pci_pending_msix.9 \ pci.9 pci_read_config.9 \ pci.9 pci_release_msi.9 \ pci.9 pci_remap_msix.9 \ pci.9 pci_restore_state.9 \ pci.9 pci_save_state.9 \ pci.9 pci_set_powerstate.9 \ pci.9 pci_set_max_read_req.9 \ pci.9 pci_write_config.9 \ pci.9 pcie_adjust_config.9 \ pci.9 pcie_flr.9 \ pci.9 pcie_max_completion_timeout.9 \ pci.9 pcie_read_config.9 \ pci.9 pcie_wait_for_pending_transactions.9 \ pci.9 pcie_write_config.9 MLINKS+=pci_iov_schema.9 pci_iov_schema_alloc_node.9 \ pci_iov_schema.9 pci_iov_schema_add_bool.9 \ pci_iov_schema.9 pci_iov_schema_add_string.9 \ pci_iov_schema.9 pci_iov_schema_add_uint8.9 \ pci_iov_schema.9 pci_iov_schema_add_uint16.9 \ pci_iov_schema.9 pci_iov_schema_add_uint32.9 \ pci_iov_schema.9 pci_iov_schema_add_uint64.9 \ pci_iov_schema.9 pci_iov_schema_add_unicast_mac.9 MLINKS+=pfil.9 pfil_add_hook.9 \ pfil.9 pfil_head_register.9 \ pfil.9 pfil_head_unregister.9 \ pfil.9 pfil_remove_hook.9 \ pfil.9 pfil_run_hooks.9 \ pfil.9 pfil_link.9 MLINKS+=pfind.9 zpfind.9 MLINKS+=PHOLD.9 PRELE.9 \ PHOLD.9 _PHOLD.9 \ PHOLD.9 _PRELE.9 \ PHOLD.9 PROC_ASSERT_HELD.9 \ PHOLD.9 PROC_ASSERT_NOT_HELD.9 MLINKS+=pmap_copy.9 pmap_copy_page.9 MLINKS+=pmap_extract.9 pmap_extract_and_hold.9 MLINKS+=pmap_init.9 pmap_init2.9 MLINKS+=pmap_is_modified.9 pmap_ts_referenced.9 MLINKS+=pmap_pinit.9 pmap_pinit0.9 \ pmap_pinit.9 pmap_pinit2.9 MLINKS+=pmap_qenter.9 pmap_qremove.9 MLINKS+=pmap_quick_enter_page.9 pmap_quick_remove_page.9 MLINKS+=pmap_remove.9 pmap_remove_all.9 \ pmap_remove.9 pmap_remove_pages.9 MLINKS+=pmap_resident_count.9 pmap_wired_count.9 MLINKS+=pmap_zero_page.9 pmap_zero_area.9 MLINKS+=printf.9 log.9 \ printf.9 tprintf.9 \ printf.9 uprintf.9 MLINKS+=priv.9 priv_check.9 \ priv.9 priv_check_cred.9 MLINKS+=proc_rwmem.9 proc_readmem.9 \ proc_rwmem.9 proc_writemem.9 MLINKS+=psignal.9 gsignal.9 \ psignal.9 pgsignal.9 \ psignal.9 tdsignal.9 MLINKS+=pwmbus.9 pwm.9 MLINKS+=random.9 arc4rand.9 \ random.9 arc4random.9 \ random.9 is_random_seeded.9 \ random.9 read_random.9 \ random.9 read_random_uio.9 \ random.9 srandom.9 MLINKS+=random_harvest.9 random_harvest_direct.9 \ 
random_harvest.9 random_harvest_fast.9 \ random_harvest.9 random_harvest_queue.9 MLINKS+=ratecheck.9 ppsratecheck.9 MLINKS+=refcount.9 refcount_acquire.9 \ refcount.9 refcount_init.9 \ refcount.9 refcount_release.9 MLINKS+=resource_int_value.9 resource_long_value.9 \ resource_int_value.9 resource_string_value.9 MLINKS+=rman.9 rman_activate_resource.9 \ rman.9 rman_adjust_resource.9 \ rman.9 rman_deactivate_resource.9 \ rman.9 rman_fini.9 \ rman.9 rman_first_free_region.9 \ rman.9 rman_get_bushandle.9 \ rman.9 rman_get_bustag.9 \ rman.9 rman_get_device.9 \ rman.9 rman_get_end.9 \ rman.9 rman_get_flags.9 \ rman.9 rman_get_mapping.9 \ rman.9 rman_get_rid.9 \ rman.9 rman_get_size.9 \ rman.9 rman_get_start.9 \ rman.9 rman_get_virtual.9 \ rman.9 rman_init.9 \ rman.9 rman_init_from_resource.9 \ rman.9 rman_is_region_manager.9 \ rman.9 rman_last_free_region.9 \ rman.9 rman_make_alignment_flags.9 \ rman.9 rman_manage_region.9 \ rman.9 rman_release_resource.9 \ rman.9 rman_reserve_resource.9 \ rman.9 rman_reserve_resource_bound.9 \ rman.9 rman_set_bushandle.9 \ rman.9 rman_set_bustag.9 \ rman.9 rman_set_mapping.9 \ rman.9 rman_set_rid.9 \ rman.9 rman_set_virtual.9 MLINKS+=rmlock.9 rm_assert.9 \ rmlock.9 rm_destroy.9 \ rmlock.9 rm_init.9 \ rmlock.9 rm_init_flags.9 \ rmlock.9 rm_rlock.9 \ rmlock.9 rm_runlock.9 \ rmlock.9 rm_sleep.9 \ rmlock.9 RM_SYSINIT.9 \ rmlock.9 RM_SYSINIT_FLAGS.9 \ rmlock.9 rm_try_rlock.9 \ rmlock.9 rm_wlock.9 \ rmlock.9 rm_wowned.9 \ rmlock.9 rm_wunlock.9 MLINKS+=rtalloc.9 rtalloc1.9 \ rtalloc.9 rtalloc_ign.9 \ rtalloc.9 RT_ADDREF.9 \ rtalloc.9 RT_LOCK.9 \ rtalloc.9 RT_REMREF.9 \ rtalloc.9 RT_RTFREE.9 \ rtalloc.9 RT_UNLOCK.9 \ rtalloc.9 RTFREE_LOCKED.9 \ rtalloc.9 RTFREE.9 \ rtalloc.9 rtfree.9 \ rtalloc.9 rtalloc1_fib.9 \ rtalloc.9 rtalloc_ign_fib.9 \ rtalloc.9 rtalloc_fib.9 MLINKS+=runqueue.9 choosethread.9 \ runqueue.9 procrunnable.9 \ runqueue.9 remrunqueue.9 \ runqueue.9 setrunqueue.9 MLINKS+=rwlock.9 rw_assert.9 \ rwlock.9 rw_destroy.9 \ rwlock.9 rw_downgrade.9 \ rwlock.9 rw_init.9 \ rwlock.9 rw_init_flags.9 \ rwlock.9 rw_initialized.9 \ rwlock.9 rw_rlock.9 \ rwlock.9 rw_runlock.9 \ rwlock.9 rw_unlock.9 \ rwlock.9 rw_sleep.9 \ rwlock.9 RW_SYSINIT.9 \ rwlock.9 RW_SYSINIT_FLAGS.9 \ rwlock.9 rw_try_rlock.9 \ rwlock.9 rw_try_upgrade.9 \ rwlock.9 rw_try_wlock.9 \ rwlock.9 rw_wlock.9 \ rwlock.9 rw_wowned.9 \ rwlock.9 rw_wunlock.9 MLINKS+=sbuf.9 sbuf_bcat.9 \ sbuf.9 sbuf_bcopyin.9 \ sbuf.9 sbuf_bcpy.9 \ sbuf.9 sbuf_cat.9 \ sbuf.9 sbuf_clear.9 \ sbuf.9 sbuf_clear_flags.9 \ sbuf.9 sbuf_copyin.9 \ sbuf.9 sbuf_cpy.9 \ sbuf.9 sbuf_data.9 \ sbuf.9 sbuf_delete.9 \ sbuf.9 sbuf_done.9 \ sbuf.9 sbuf_error.9 \ sbuf.9 sbuf_finish.9 \ sbuf.9 sbuf_get_flags.9 \ sbuf.9 sbuf_hexdump.9 \ sbuf.9 sbuf_len.9 \ sbuf.9 sbuf_new.9 \ sbuf.9 sbuf_new_auto.9 \ sbuf.9 sbuf_new_for_sysctl.9 \ sbuf.9 sbuf_printf.9 \ sbuf.9 sbuf_printf_drain.9 \ sbuf.9 sbuf_putbuf.9 \ sbuf.9 sbuf_putc.9 \ sbuf.9 sbuf_set_drain.9 \ sbuf.9 sbuf_set_flags.9 \ sbuf.9 sbuf_setpos.9 \ sbuf.9 sbuf_start_section.9 \ sbuf.9 sbuf_end_section.9 \ sbuf.9 sbuf_trim.9 \ sbuf.9 sbuf_vprintf.9 MLINKS+=scheduler.9 curpriority_cmp.9 \ scheduler.9 maybe_resched.9 \ scheduler.9 propagate_priority.9 \ scheduler.9 resetpriority.9 \ scheduler.9 roundrobin.9 \ scheduler.9 roundrobin_interval.9 \ scheduler.9 schedclock.9 \ scheduler.9 schedcpu.9 \ scheduler.9 sched_setup.9 \ scheduler.9 setrunnable.9 \ scheduler.9 updatepri.9 MLINKS+=SDT.9 SDT_PROVIDER_DECLARE.9 \ SDT.9 SDT_PROVIDER_DEFINE.9 \ SDT.9 SDT_PROBE_DECLARE.9 \ SDT.9 SDT_PROBE_DEFINE.9 \ 
SDT.9 SDT_PROBE.9 MLINKS+=securelevel_gt.9 securelevel_ge.9 MLINKS+=selrecord.9 seldrain.9 \ selrecord.9 selwakeup.9 MLINKS+=sema.9 sema_destroy.9 \ sema.9 sema_init.9 \ sema.9 sema_post.9 \ sema.9 sema_timedwait.9 \ sema.9 sema_trywait.9 \ sema.9 sema_value.9 \ sema.9 sema_wait.9 MLINKS+=seqc.9 seqc_consistent.9 \ seqc.9 seqc_read.9 \ seqc.9 seqc_write_begin.9 \ seqc.9 seqc_write_end.9 MLINKS+=sf_buf.9 sf_buf_alloc.9 \ sf_buf.9 sf_buf_free.9 \ sf_buf.9 sf_buf_kva.9 \ sf_buf.9 sf_buf_page.9 MLINKS+=sglist.9 sglist_alloc.9 \ sglist.9 sglist_append.9 \ sglist.9 sglist_append_bio.9 \ sglist.9 sglist_append_ext_pgs.9 \ sglist.9 sglist_append_mb_ext_pgs.9 \ sglist.9 sglist_append_mbuf.9 \ sglist.9 sglist_append_phys.9 \ sglist.9 sglist_append_sglist.9 \ sglist.9 sglist_append_uio.9 \ sglist.9 sglist_append_user.9 \ sglist.9 sglist_append_vmpages.9 \ sglist.9 sglist_build.9 \ sglist.9 sglist_clone.9 \ sglist.9 sglist_consume_uio.9 \ sglist.9 sglist_count.9 \ sglist.9 sglist_count_ext_pgs.9 \ sglist.9 sglist_count_mb_ext_pgs.9 \ sglist.9 sglist_count_vmpages.9 \ sglist.9 sglist_free.9 \ sglist.9 sglist_hold.9 \ sglist.9 sglist_init.9 \ sglist.9 sglist_join.9 \ sglist.9 sglist_length.9 \ sglist.9 sglist_reset.9 \ sglist.9 sglist_slice.9 \ sglist.9 sglist_split.9 MLINKS+=shm_map.9 shm_unmap.9 MLINKS+=signal.9 cursig.9 \ signal.9 execsigs.9 \ signal.9 issignal.9 \ signal.9 killproc.9 \ signal.9 pgsigio.9 \ signal.9 postsig.9 \ signal.9 SETSETNEQ.9 \ signal.9 SETSETOR.9 \ signal.9 SIGADDSET.9 \ signal.9 SIG_CONTSIGMASK.9 \ signal.9 SIGDELSET.9 \ signal.9 SIGEMPTYSET.9 \ signal.9 sigexit.9 \ signal.9 SIGFILLSET.9 \ signal.9 siginit.9 \ signal.9 SIGISEMPTY.9 \ signal.9 SIGISMEMBER.9 \ signal.9 SIGNOTEMPTY.9 \ signal.9 signotify.9 \ signal.9 SIGPENDING.9 \ signal.9 SIGSETAND.9 \ signal.9 SIGSETCANTMASK.9 \ signal.9 SIGSETEQ.9 \ signal.9 SIGSETNAND.9 \ signal.9 SIG_STOPSIGMASK.9 \ signal.9 trapsignal.9 MLINKS+=sleep.9 msleep.9 \ sleep.9 msleep_sbt.9 \ sleep.9 msleep_spin.9 \ sleep.9 msleep_spin_sbt.9 \ sleep.9 pause.9 \ sleep.9 pause_sig.9 \ sleep.9 pause_sbt.9 \ sleep.9 tsleep.9 \ sleep.9 tsleep_sbt.9 \ sleep.9 wakeup.9 \ sleep.9 wakeup_one.9 \ sleep.9 wakeup_any.9 MLINKS+=sleepqueue.9 init_sleepqueues.9 \ sleepqueue.9 sleepq_abort.9 \ sleepqueue.9 sleepq_add.9 \ sleepqueue.9 sleepq_alloc.9 \ sleepqueue.9 sleepq_broadcast.9 \ sleepqueue.9 sleepq_free.9 \ sleepqueue.9 sleepq_lookup.9 \ sleepqueue.9 sleepq_lock.9 \ sleepqueue.9 sleepq_release.9 \ sleepqueue.9 sleepq_remove.9 \ sleepqueue.9 sleepq_set_timeout.9 \ sleepqueue.9 sleepq_set_timeout_sbt.9 \ sleepqueue.9 sleepq_signal.9 \ sleepqueue.9 sleepq_sleepcnt.9 \ sleepqueue.9 sleepq_timedwait.9 \ sleepqueue.9 sleepq_timedwait_sig.9 \ sleepqueue.9 sleepq_type.9 \ sleepqueue.9 sleepq_wait.9 \ sleepqueue.9 sleepq_wait_sig.9 MLINKS+=socket.9 soabort.9 \ socket.9 soaccept.9 \ socket.9 sobind.9 \ socket.9 socheckuid.9 \ socket.9 soclose.9 \ socket.9 soconnect.9 \ socket.9 socreate.9 \ socket.9 sodisconnect.9 \ socket.9 sodtor_set.9 \ socket.9 sodupsockaddr.9 \ socket.9 sofree.9 \ socket.9 sogetopt.9 \ socket.9 sohasoutofband.9 \ socket.9 solisten.9 \ socket.9 solisten_proto.9 \ socket.9 solisten_proto_check.9 \ socket.9 sonewconn.9 \ socket.9 sooptcopyin.9 \ socket.9 sooptcopyout.9 \ socket.9 sopoll.9 \ socket.9 sopoll_generic.9 \ socket.9 soreceive.9 \ socket.9 soreceive_dgram.9 \ socket.9 soreceive_generic.9 \ socket.9 soreceive_stream.9 \ socket.9 soreserve.9 \ socket.9 sorflush.9 \ socket.9 sosend.9 \ socket.9 sosend_dgram.9 \ socket.9 sosend_generic.9 \ 
socket.9 sosetopt.9 \ socket.9 soshutdown.9 \ socket.9 sotoxsocket.9 \ socket.9 soupcall_clear.9 \ socket.9 soupcall_set.9 \ socket.9 sowakeup.9 MLINKS+=stack.9 stack_copy.9 \ stack.9 stack_create.9 \ stack.9 stack_destroy.9 \ stack.9 stack_print.9 \ stack.9 stack_print_ddb.9 \ stack.9 stack_print_short.9 \ stack.9 stack_print_short_ddb.9 \ stack.9 stack_put.9 \ stack.9 stack_save.9 \ stack.9 stack_sbuf_print.9 \ stack.9 stack_sbuf_print_ddb.9 \ stack.9 stack_zero.9 MLINKS+=store.9 subyte.9 \ store.9 suword.9 \ store.9 suword16.9 \ store.9 suword32.9 \ store.9 suword64.9 MLINKS+=swi.9 swi_add.9 \ swi.9 swi_remove.9 \ swi.9 swi_sched.9 MLINKS+=sx.9 sx_assert.9 \ sx.9 sx_destroy.9 \ sx.9 sx_downgrade.9 \ sx.9 sx_init.9 \ sx.9 sx_init_flags.9 \ sx.9 sx_sleep.9 \ sx.9 sx_slock.9 \ sx.9 sx_slock_sig.9 \ sx.9 sx_sunlock.9 \ sx.9 SX_SYSINIT.9 \ sx.9 SX_SYSINIT_FLAGS.9 \ sx.9 sx_try_slock.9 \ sx.9 sx_try_upgrade.9 \ sx.9 sx_try_xlock.9 \ sx.9 sx_unlock.9 \ sx.9 sx_xholder.9 \ sx.9 sx_xlock.9 \ sx.9 sx_xlock_sig.9 \ sx.9 sx_xlocked.9 \ sx.9 sx_xunlock.9 MLINKS+=syscall_helper_register.9 syscall_helper_unregister.9 \ syscall_helper_register.9 SYSCALL_INIT_HELPER.9 \ syscall_helper_register.9 SYSCALL_INIT_HELPER_COMPAT.9 \ syscall_helper_register.9 SYSCALL_INIT_HELPER_COMPAT_F.9 \ syscall_helper_register.9 SYSCALL_INIT_HELPER_F.9 MLINKS+=sysctl.9 SYSCTL_DECL.9 \ sysctl.9 SYSCTL_ADD_INT.9 \ sysctl.9 SYSCTL_ADD_LONG.9 \ sysctl.9 SYSCTL_ADD_NODE.9 \ sysctl.9 SYSCTL_ADD_NODE_WITH_LABEL.9 \ sysctl.9 SYSCTL_ADD_OPAQUE.9 \ sysctl.9 SYSCTL_ADD_PROC.9 \ sysctl.9 SYSCTL_ADD_QUAD.9 \ sysctl.9 SYSCTL_ADD_ROOT_NODE.9 \ sysctl.9 SYSCTL_ADD_S8.9 \ sysctl.9 SYSCTL_ADD_S16.9 \ sysctl.9 SYSCTL_ADD_S32.9 \ sysctl.9 SYSCTL_ADD_S64.9 \ sysctl.9 SYSCTL_ADD_STRING.9 \ sysctl.9 SYSCTL_ADD_STRUCT.9 \ sysctl.9 SYSCTL_ADD_TIMEVAL_SEC.9 \ sysctl.9 SYSCTL_ADD_U8.9 \ sysctl.9 SYSCTL_ADD_U16.9 \ sysctl.9 SYSCTL_ADD_U32.9 \ sysctl.9 SYSCTL_ADD_U64.9 \ sysctl.9 SYSCTL_ADD_UAUTO.9 \ sysctl.9 SYSCTL_ADD_UINT.9 \ sysctl.9 SYSCTL_ADD_ULONG.9 \ sysctl.9 SYSCTL_ADD_UQUAD.9 \ sysctl.9 SYSCTL_CHILDREN.9 \ sysctl.9 SYSCTL_STATIC_CHILDREN.9 \ sysctl.9 SYSCTL_NODE_CHILDREN.9 \ sysctl.9 SYSCTL_PARENT.9 \ sysctl.9 SYSCTL_INT.9 \ sysctl.9 SYSCTL_INT_WITH_LABEL.9 \ sysctl.9 SYSCTL_LONG.9 \ sysctl.9 sysctl_msec_to_ticks.9 \ sysctl.9 SYSCTL_NODE.9 \ sysctl.9 SYSCTL_NODE_WITH_LABEL.9 \ sysctl.9 SYSCTL_OPAQUE.9 \ sysctl.9 SYSCTL_PROC.9 \ sysctl.9 SYSCTL_QUAD.9 \ sysctl.9 SYSCTL_ROOT_NODE.9 \ sysctl.9 SYSCTL_S8.9 \ sysctl.9 SYSCTL_S16.9 \ sysctl.9 SYSCTL_S32.9 \ sysctl.9 SYSCTL_S64.9 \ sysctl.9 SYSCTL_STRING.9 \ sysctl.9 SYSCTL_STRUCT.9 \ sysctl.9 SYSCTL_TIMEVAL_SEC.9 \ sysctl.9 SYSCTL_U8.9 \ sysctl.9 SYSCTL_U16.9 \ sysctl.9 SYSCTL_U32.9 \ sysctl.9 SYSCTL_U64.9 \ sysctl.9 SYSCTL_UINT.9 \ sysctl.9 SYSCTL_ULONG.9 \ sysctl.9 SYSCTL_UQUAD.9 MLINKS+=sysctl_add_oid.9 sysctl_move_oid.9 \ sysctl_add_oid.9 sysctl_remove_oid.9 \ sysctl_add_oid.9 sysctl_remove_name.9 MLINKS+=sysctl_ctx_init.9 sysctl_ctx_entry_add.9 \ sysctl_ctx_init.9 sysctl_ctx_entry_del.9 \ sysctl_ctx_init.9 sysctl_ctx_entry_find.9 \ sysctl_ctx_init.9 sysctl_ctx_free.9 MLINKS+=SYSINIT.9 SYSUNINIT.9 MLINKS+=taskqueue.9 TASK_INIT.9 \ taskqueue.9 TASK_INITIALIZER.9 \ taskqueue.9 taskqueue_block.9 \ taskqueue.9 taskqueue_cancel.9 \ taskqueue.9 taskqueue_cancel_timeout.9 \ taskqueue.9 taskqueue_create.9 \ taskqueue.9 taskqueue_create_fast.9 \ taskqueue.9 TASKQUEUE_DECLARE.9 \ taskqueue.9 TASKQUEUE_DEFINE.9 \ taskqueue.9 TASKQUEUE_DEFINE_THREAD.9 \ taskqueue.9 taskqueue_drain.9 \ taskqueue.9 
taskqueue_drain_all.9 \ taskqueue.9 taskqueue_drain_timeout.9 \ taskqueue.9 taskqueue_enqueue.9 \ taskqueue.9 taskqueue_enqueue_timeout.9 \ taskqueue.9 TASKQUEUE_FAST_DEFINE.9 \ taskqueue.9 TASKQUEUE_FAST_DEFINE_THREAD.9 \ taskqueue.9 taskqueue_free.9 \ taskqueue.9 taskqueue_member.9 \ taskqueue.9 taskqueue_quiesce.9 \ taskqueue.9 taskqueue_run.9 \ taskqueue.9 taskqueue_set_callback.9 \ taskqueue.9 taskqueue_start_threads.9 \ taskqueue.9 taskqueue_start_threads_pinned.9 \ taskqueue.9 taskqueue_unblock.9 \ taskqueue.9 TIMEOUT_TASK_INIT.9 MLINKS+=tcp_functions.9 register_tcp_functions.9 \ tcp_functions.9 register_tcp_functions_as_name.9 \ tcp_functions.9 register_tcp_functions_as_names.9 \ tcp_functions.9 deregister_tcp_functions.9 MLINKS+=time.9 boottime.9 \ time.9 time_second.9 \ time.9 time_uptime.9 MLINKS+=timeout.9 callout.9 \ timeout.9 callout_active.9 \ timeout.9 callout_async_drain.9 \ timeout.9 callout_deactivate.9 \ timeout.9 callout_drain.9 \ timeout.9 callout_handle_init.9 \ timeout.9 callout_init.9 \ timeout.9 callout_init_mtx.9 \ timeout.9 callout_init_rm.9 \ timeout.9 callout_init_rw.9 \ timeout.9 callout_pending.9 \ timeout.9 callout_reset.9 \ timeout.9 callout_reset_curcpu.9 \ timeout.9 callout_reset_on.9 \ timeout.9 callout_reset_sbt.9 \ timeout.9 callout_reset_sbt_curcpu.9 \ timeout.9 callout_reset_sbt_on.9 \ timeout.9 callout_schedule.9 \ timeout.9 callout_schedule_curcpu.9 \ timeout.9 callout_schedule_on.9 \ timeout.9 callout_schedule_sbt.9 \ timeout.9 callout_schedule_sbt_curcpu.9 \ timeout.9 callout_schedule_sbt_on.9 \ timeout.9 callout_stop.9 \ timeout.9 callout_when.9 \ timeout.9 untimeout.9 MLINKS+=ucred.9 crcopy.9 \ ucred.9 crcopysafe.9 \ ucred.9 crdup.9 \ ucred.9 crfree.9 \ ucred.9 crget.9 \ ucred.9 crhold.9 \ ucred.9 crsetgroups.9 \ ucred.9 cru2x.9 MLINKS+=uidinfo.9 uifind.9 \ uidinfo.9 uifree.9 \ uidinfo.9 uihashinit.9 \ uidinfo.9 uihold.9 MLINKS+=uio.9 uiomove.9 \ uio.9 uiomove_frombuf.9 \ uio.9 uiomove_nofault.9 .if ${MK_USB} != "no" MAN+= usbdi.9 MLINKS+=usbdi.9 usbd_do_request.9 \ usbdi.9 usbd_do_request_flags.9 \ usbdi.9 usbd_errstr.9 \ usbdi.9 usbd_lookup_id_by_info.9 \ usbdi.9 usbd_lookup_id_by_uaa.9 \ usbdi.9 usbd_transfer_clear_stall.9 \ usbdi.9 usbd_transfer_drain.9 \ usbdi.9 usbd_transfer_pending.9 \ usbdi.9 usbd_transfer_poll.9 \ usbdi.9 usbd_transfer_setup.9 \ usbdi.9 usbd_transfer_start.9 \ usbdi.9 usbd_transfer_stop.9 \ usbdi.9 usbd_transfer_submit.9 \ usbdi.9 usbd_transfer_unsetup.9 \ usbdi.9 usbd_xfer_clr_flag.9 \ usbdi.9 usbd_xfer_frame_data.9 \ usbdi.9 usbd_xfer_frame_len.9 \ usbdi.9 usbd_xfer_get_frame.9 \ usbdi.9 usbd_xfer_get_priv.9 \ usbdi.9 usbd_xfer_is_stalled.9 \ usbdi.9 usbd_xfer_max_framelen.9 \ usbdi.9 usbd_xfer_max_frames.9 \ usbdi.9 usbd_xfer_max_len.9 \ usbdi.9 usbd_xfer_set_flag.9 \ usbdi.9 usbd_xfer_set_frame_data.9 \ usbdi.9 usbd_xfer_set_frame_len.9 \ usbdi.9 usbd_xfer_set_frame_offset.9 \ usbdi.9 usbd_xfer_set_frames.9 \ usbdi.9 usbd_xfer_set_interval.9 \ usbdi.9 usbd_xfer_set_priv.9 \ usbdi.9 usbd_xfer_set_stall.9 \ usbdi.9 usbd_xfer_set_timeout.9 \ usbdi.9 usbd_xfer_softc.9 \ usbdi.9 usbd_xfer_state.9 \ usbdi.9 usbd_xfer_status.9 \ usbdi.9 usb_fifo_alloc_buffer.9 \ usbdi.9 usb_fifo_attach.9 \ usbdi.9 usb_fifo_detach.9 \ usbdi.9 usb_fifo_free_buffer.9 \ usbdi.9 usb_fifo_get_data.9 \ usbdi.9 usb_fifo_get_data_buffer.9 \ usbdi.9 usb_fifo_get_data_error.9 \ usbdi.9 usb_fifo_get_data_linear.9 \ usbdi.9 usb_fifo_put_bytes_max.9 \ usbdi.9 usb_fifo_put_data.9 \ usbdi.9 usb_fifo_put_data_buffer.9 \ usbdi.9 usb_fifo_put_data_error.9 
\ usbdi.9 usb_fifo_put_data_linear.9 \ usbdi.9 usb_fifo_reset.9 \ usbdi.9 usb_fifo_softc.9 \ usbdi.9 usb_fifo_wakeup.9 .endif MLINKS+=vcount.9 count_dev.9 MLINKS+=vfsconf.9 vfs_modevent.9 \ vfsconf.9 vfs_register.9 \ vfsconf.9 vfs_unregister.9 MLINKS+=vfs_getopt.9 vfs_copyopt.9 \ vfs_getopt.9 vfs_filteropt.9 \ vfs_getopt.9 vfs_flagopt.9 \ vfs_getopt.9 vfs_getopts.9 \ vfs_getopt.9 vfs_scanopt.9 \ vfs_getopt.9 vfs_setopt.9 \ vfs_getopt.9 vfs_setopt_part.9 \ vfs_getopt.9 vfs_setopts.9 MLINKS+=vhold.9 vdrop.9 \ vhold.9 vdropl.9 \ vhold.9 vholdl.9 MLINKS+=vmem.9 vmem_add.9 \ vmem.9 vmem_alloc.9 \ vmem.9 vmem_create.9 \ vmem.9 vmem_destroy.9 \ vmem.9 vmem_free.9 \ vmem.9 vmem_xalloc.9 \ vmem.9 vmem_xfree.9 MLINKS+=vm_map_lock.9 vm_map_lock_downgrade.9 \ vm_map_lock.9 vm_map_lock_read.9 \ vm_map_lock.9 vm_map_lock_upgrade.9 \ vm_map_lock.9 vm_map_trylock.9 \ vm_map_lock.9 vm_map_trylock_read.9 \ vm_map_lock.9 vm_map_unlock.9 \ vm_map_lock.9 vm_map_unlock_read.9 MLINKS+=vm_map_lookup.9 vm_map_lookup_done.9 MLINKS+=vm_map_max.9 vm_map_min.9 \ vm_map_max.9 vm_map_pmap.9 MLINKS+=vm_map_stack.9 vm_map_growstack.9 MLINKS+=vm_map_wire.9 vm_map_unwire.9 MLINKS+=vm_page_bits.9 vm_page_clear_dirty.9 \ vm_page_bits.9 vm_page_dirty.9 \ vm_page_bits.9 vm_page_is_valid.9 \ vm_page_bits.9 vm_page_set_invalid.9 \ vm_page_bits.9 vm_page_set_validclean.9 \ vm_page_bits.9 vm_page_test_dirty.9 \ vm_page_bits.9 vm_page_undirty.9 \ vm_page_bits.9 vm_page_zero_invalid.9 MLINKS+=vm_page_busy.9 vm_page_busied.9 \ vm_page_busy.9 vm_page_busy_downgrade.9 \ vm_page_busy.9 vm_page_busy_sleep.9 \ vm_page_busy.9 vm_page_sbusied.9 \ vm_page_busy.9 vm_page_sbusy.9 \ vm_page_busy.9 vm_page_sleep_if_busy.9 \ vm_page_busy.9 vm_page_sunbusy.9 \ vm_page_busy.9 vm_page_trysbusy.9 \ vm_page_busy.9 vm_page_tryxbusy.9 \ vm_page_busy.9 vm_page_xbusied.9 \ vm_page_busy.9 vm_page_xbusy.9 \ vm_page_busy.9 vm_page_xunbusy.9 \ vm_page_busy.9 vm_page_assert_sbusied.9 \ vm_page_busy.9 vm_page_assert_unbusied.9 \ vm_page_busy.9 vm_page_assert_xbusied.9 MLINKS+=vm_page_aflag.9 vm_page_aflag_clear.9 \ vm_page_aflag.9 vm_page_aflag_set.9 \ vm_page_aflag.9 vm_page_reference.9 MLINKS+=vm_page_free.9 vm_page_free_toq.9 \ vm_page_free.9 vm_page_free_zero.9 \ vm_page_free.9 vm_page_try_to_free.9 MLINKS+=vm_page_insert.9 vm_page_remove.9 MLINKS+=vm_page_wire.9 vm_page_unwire.9 MLINKS+=VOP_ACCESS.9 VOP_ACCESSX.9 MLINKS+=VOP_ATTRIB.9 VOP_GETATTR.9 \ VOP_ATTRIB.9 VOP_SETATTR.9 MLINKS+=VOP_CREATE.9 VOP_MKDIR.9 \ VOP_CREATE.9 VOP_MKNOD.9 \ VOP_CREATE.9 VOP_SYMLINK.9 MLINKS+=VOP_FSYNC.9 VOP_FDATASYNC.9 MLINKS+=VOP_GETPAGES.9 VOP_PUTPAGES.9 MLINKS+=VOP_INACTIVE.9 VOP_RECLAIM.9 MLINKS+=VOP_LOCK.9 vn_lock.9 \ VOP_LOCK.9 VOP_ISLOCKED.9 \ VOP_LOCK.9 VOP_UNLOCK.9 MLINKS+=VOP_OPENCLOSE.9 VOP_CLOSE.9 \ VOP_OPENCLOSE.9 VOP_OPEN.9 MLINKS+=VOP_RDWR.9 VOP_READ.9 \ VOP_RDWR.9 VOP_WRITE.9 MLINKS+=VOP_REMOVE.9 VOP_RMDIR.9 MLINKS+=vnet.9 vimage.9 MLINKS+=vref.9 VREF.9 \ vref.9 vrefl.9 MLINKS+=vrele.9 vput.9 \ vrele.9 vunref.9 MLINKS+=vslock.9 vsunlock.9 MLINKS+=zone.9 uma.9 \ zone.9 uma_zalloc.9 \ zone.9 uma_zalloc_arg.9 \ zone.9 uma_zalloc_domain.9 \ zone.9 uma_zcreate.9 \ zone.9 uma_zdestroy.9 \ zone.9 uma_zfree.9 \ zone.9 uma_zfree_arg.9 \ zone.9 uma_zfree_domain.9 \ zone.9 uma_zone_get_cur.9 \ zone.9 uma_zone_get_max.9 \ zone.9 uma_zone_set_max.9 \ zone.9 uma_zone_set_warning.9 \ zone.9 uma_zone_set_maxaction.9 .include Index: head/share/man/man9/g_bio.9 =================================================================== --- head/share/man/man9/g_bio.9 (revision 350693) 
+++ head/share/man/man9/g_bio.9 (revision 350694) @@ -1,306 +1,328 @@ .\" .\" Copyright (c) 2004-2006 Pawel Jakub Dawidek .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. .\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd Mar 7, 2018 +.Dd August 7, 2019 .Dt G_BIO 9 .Os .Sh NAME .Nm g_new_bio , .Nm g_clone_bio , .Nm g_destroy_bio , +.Nm g_format_bio , .Nm g_print_bio , .Nm g_reset_bio .Nd "GEOM bio controlling functions" .Sh SYNOPSIS .In sys/bio.h .In geom/geom.h .Ft "struct bio *" .Fn g_new_bio void .Ft "struct bio *" .Fn g_alloc_bio void .Ft "struct bio *" .Fn g_clone_bio "struct bio *bp" .Ft "struct bio *" .Fn g_duplicate_bio "struct bio *bp" .Ft void .Fn g_destroy_bio "struct bio *bp" .Ft void -.Fn g_print_bio "struct bio *bp" +.Fn g_format_bio "struct sbuf *sb" "const struct bio *bp" .Ft void +.Fo g_print_bio +.Fa "const char *prefix" "const struct bio *bp" +.Fa "const char *fmtsuffix" ... +.Fc +.Ft void .Fn g_reset_bio "struct bio *bp" .Sh DESCRIPTION A .Vt "struct bio" is used by GEOM to describe I/O requests; its most important fields are described below: .Bl -tag -width ".Va bio_attribute" .It Va bio_cmd I/O request command. There are five I/O requests available in GEOM: .Bl -tag -width ".Dv BIO_GETATTR" .It Dv BIO_READ A read request. .It Dv BIO_WRITE A write request. .It Dv BIO_DELETE Indicates that a certain range of data is no longer used and that it can be erased or freed as the underlying technology supports. Technologies like flash adaptation layers can arrange to erase the relevant blocks before they will become reassigned and cryptographic devices may want to fill random bits into the range to reduce the amount of data available for attack. .It Dv BIO_GETATTR Inspect and manipulate out-of-band attributes on a particular provider or path. Attributes are named by ascii strings and are stored in the .Va bio_attribute field. .It Dv BIO_FLUSH Tells underlying providers to flush their write caches. .El .It Va bio_flags Available flags: .Bl -tag -width ".Dv BIO_ERROR" .It Dv BIO_ERROR Request failed (error value is stored in .Va bio_error field). .It Dv BIO_DONE Request finished. .El .It Va bio_cflags Private use by the consumer. .It Va bio_pflags Private use by the provider. .It Va bio_offset Offset into provider. .It Va bio_data Pointer to data buffer.
.It Va bio_error Error value when .Dv BIO_ERROR is set. .It Va bio_done Pointer to function which will be called when the request is finished. .It Va bio_driver1 Private use by the provider. .It Va bio_driver2 Private use by the provider. .It Va bio_caller1 Private use by the consumer. .It Va bio_caller2 Private use by the consumer. .It Va bio_attribute Attribute string for .Dv BIO_GETATTR request. .It Va bio_from Consumer to use for request (attached to provider stored in .Va bio_to field) (typically read-only for a class). .It Va bio_to Destination provider (typically read-only for a class). .It Va bio_length Request length in bytes. .It Va bio_completed Number of bytes completed, but they may not be completed from the front of the request. .It Va bio_children Number of .Vt bio clones (typically read-only for a class). .It Va bio_inbed Number of finished .Vt bio clones. .It Va bio_parent Pointer to parent .Vt bio . .El .Pp The .Fn g_new_bio function allocates a new, empty .Vt bio structure. .Pp .Fn g_alloc_bio - same as .Fn g_new_bio , but always succeeds (allocates bio with the .Dv M_WAITOK malloc flag). .Pp The .Fn g_clone_bio function allocates a new .Vt bio structure and copies the following fields from the .Vt bio given as an argument to clone: .Va bio_cmd , .Va bio_length , .Va bio_offset , .Va bio_data , .Va bio_attribute . The field .Va bio_parent in the clone points to the passed .Vt bio and the field .Va bio_children in the passed .Vt bio is incremented. .Pp This function should be used for every request which enters through the provider of a particular geom and needs to be scheduled down. Proper order is: .Pp .Bl -enum -compact .It Clone the received .Vt "struct bio" . .It Modify the clone. .It Schedule the clone on its own consumer. .El .Pp .Fn g_duplicate_bio - same as .Fn g_clone_bio , but always succeeds (allocates bio with the .Dv M_WAITOK malloc flag). .Pp The .Fn g_destroy_bio function deallocates and destroys the given .Vt bio structure. .Pp The -.Fn g_print_bio +.Fn g_format_bio function prints information about the given .Vt bio -structure (for debugging purposes). +structure into the provided +.Vt sbuf . .Pp The +.Fn g_print_bio +function is a convenience wrapper around +.Fn g_format_bio +that can be used for debugging purposes. +It prints a provided +.Fa prefix +string, followed by the formatted +.Vt bio , +followed by a +.Fa fmtsuffix +in the style of +.Xr printf 9 . +The prefix and suffix strings may each be the empty string. +.Fn g_print_bio +always prints a newline character at the end of the line. +.Pp +The .Fn g_reset_bio function resets the given .Vt bio structure back to its initial state. .Fn g_reset_bio preserves internal data structures, while setting all user visible fields to their initial values. When reusing a .Vt bio obtained from .Fn g_new_bio , .Fn g_alloc_bio , .Fn g_clone_bio , or .Fn g_duplicate_bio for multiple transactions, .Fn g_reset_bio must be called between the transactions in lieu of .Fn bzero . While not strictly required for a .Vt bio structure created by other means, .Fn g_reset_bio should be used to initialize it and between transactions. .Sh RETURN VALUES The .Fn g_new_bio and .Fn g_clone_bio functions return a pointer to the allocated .Vt bio , or .Dv NULL if an error occurred. .Sh EXAMPLES Implementation of .Dq Dv NULL Ns -transformation , meaning that an I/O request is cloned and scheduled down without any modifications.
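.Pp
Before the full example, here is a minimal sketch of rendering a request into an
.Vt sbuf
with
.Fn g_format_bio
for a driver's own logging.
This is a hypothetical helper, not taken from the GEOM sources; it assumes the
buffer is managed with the auto-extending
.Xr sbuf 9
routines, whose allocation may fail:
.Bd -literal -offset indent
static void
example_log_bio(struct bio *bp)
{
	struct sbuf *sb;

	/* Hypothetical helper; assumes sbuf(9) auto-extending buffers. */
	sb = sbuf_new_auto();
	if (sb == NULL)
		return;
	sbuf_cat(sb, "Request received: ");
	g_format_bio(sb, bp);
	if (sbuf_finish(sb) == 0)
		printf("%s\\n", sbuf_data(sb));
	sbuf_delete(sb);
}
.Ed
.Pp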
Let us assume that field .Va ex_consumer in structure .Vt example_softc contains a consumer attached to the provider we want to operate on. .Bd -literal -offset indent void example_start(struct bio *bp) { struct example_softc *sc; struct bio *cbp; - printf("Request received: "); - g_print_bio(bp); - printf("\\n"); + g_print_bio("Request received: ", bp, ""); sc = bp->bio_to->geom->softc; if (sc == NULL) { g_io_deliver(bp, ENXIO); return; } /* Let's clone our bio request. */ cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; /* Standard 'done' function. */ /* Ok, schedule it down. */ /* * The consumer can be obtained from * LIST_FIRST(&bp->bio_to->geom->consumer) as well, * if there is only one in our geom. */ g_io_request(cbp, sc->ex_consumer); } .Ed .Sh SEE ALSO .Xr geom 4 , .Xr DECLARE_GEOM_CLASS 9 , .Xr g_access 9 , .Xr g_attach 9 , .Xr g_consumer 9 , .Xr g_data 9 , .Xr g_event 9 , .Xr g_geom 9 , .Xr g_provider 9 , .Xr g_provider_by_name 9 , .Xr g_wither_geom 9 .Sh AUTHORS .An -nosplit This manual page was written by .An Pawel Jakub Dawidek Aq Mt pjd@FreeBSD.org . Index: head/sys/dev/fdc/fdc.c =================================================================== --- head/sys/dev/fdc/fdc.c (revision 350693) +++ head/sys/dev/fdc/fdc.c (revision 350694) @@ -1,2106 +1,2104 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Don Ahn. * * Libretto PCMCIA floppy support by David Horwitt (dhorwitt@ucsd.edu) * aided by the Linux floppy driver modifications from David Bateman * (dbateman@eng.uts.edu.au). * * Copyright (c) 1993, 1994 by * jc@irbs.UUCP (John Capo) * vak@zebub.msk.su (Serge Vakulenko) * ache@astral.msk.su (Andrew A. Chernov) * * Copyright (c) 1993, 1994, 1995 by * joerg_wunsch@uriah.sax.de (Joerg Wunsch) * dufault@hda.com (Peter Dufault) * * Copyright (c) 2001 Joerg Wunsch, * joerg_wunsch@uriah.heep.sax.de (Joerg Wunsch) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)fd.c 7.4 (Berkeley) 5/25/91 * */ #include __FBSDID("$FreeBSD$"); #include "opt_fdc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Runtime configuration hints/flags */ /* configuration flags for fd */ #define FD_TYPEMASK 0x0f /* drive type, matches enum * fd_drivetype; on i386 machines, if * given as 0, use RTC type for fd0 * and fd1 */ #define FD_NO_CHLINE 0x10 /* drive does not support changeline * aka. unit attention */ #define FD_NO_PROBE 0x20 /* don't probe drive (seek test), just * assume it is there */ /* * Things that could conceiveably considered parameters or tweakables */ /* * Maximal number of bytes in a cylinder. * This is used for ISADMA bouncebuffer allocation and sets the max * xfersize we support. * * 2.88M format has 2 x 36 x 512, allow for hacked up density. */ #define MAX_BYTES_PER_CYL (2 * 40 * 512) /* * Timeout value for the PIO loops to wait until the FDC main status * register matches our expectations (request for master, direction * bit). This is supposed to be a number of microseconds, although * timing might actually not be very accurate. * * Timeouts of 100 msec are believed to be required for some broken * (old) hardware. */ #define FDSTS_TIMEOUT 100000 /* * After this many errors, stop whining. Close will reset this count. */ #define FDC_ERRMAX 100 /* * AutoDensity search lists for each drive type. */ static struct fd_type fd_searchlist_360k[] = { { FDF_5_360 }, { 0 } }; static struct fd_type fd_searchlist_12m[] = { { FDF_5_1200 | FL_AUTO }, { FDF_5_400 | FL_AUTO }, { FDF_5_360 | FL_2STEP | FL_AUTO}, { 0 } }; static struct fd_type fd_searchlist_720k[] = { { FDF_3_720 }, { 0 } }; static struct fd_type fd_searchlist_144m[] = { { FDF_3_1440 | FL_AUTO}, { FDF_3_720 | FL_AUTO}, { 0 } }; static struct fd_type fd_searchlist_288m[] = { { FDF_3_1440 | FL_AUTO }, #if 0 { FDF_3_2880 | FL_AUTO }, /* XXX: probably doesn't work */ #endif { FDF_3_720 | FL_AUTO}, { 0 } }; /* * Order must match enum fd_drivetype in . */ static struct fd_type *fd_native_types[] = { NULL, /* FDT_NONE */ fd_searchlist_360k, /* FDT_360K */ fd_searchlist_12m, /* FDT_12M */ fd_searchlist_720k, /* FDT_720K */ fd_searchlist_144m, /* FDT_144M */ fd_searchlist_288m, /* FDT_288M_1 (mapped to FDT_288M) */ fd_searchlist_288m, /* FDT_288M */ }; /* * Internals start here */ /* registers */ #define FDOUT 2 /* Digital Output Register (W) */ #define FDO_FDSEL 0x03 /* floppy device select */ #define FDO_FRST 0x04 /* floppy controller reset */ #define FDO_FDMAEN 0x08 /* enable floppy DMA and Interrupt */ #define FDO_MOEN0 0x10 /* motor enable drive 0 */ #define FDO_MOEN1 0x20 /* motor enable drive 1 */ #define FDO_MOEN2 0x40 /* motor enable drive 2 */ #define FDO_MOEN3 0x80 /* motor enable drive 3 */ #define FDSTS 4 /* NEC 765 Main Status Register (R) */ #define FDDSR 4 /* Data Rate Select Register (W) */ #define FDDATA 5 /* NEC 765 Data Register (R/W) */ #define FDCTL 7 /* Control Register (W) */ /* * The YE-DATA PC Card floppies use PIO to read in the data rather * than DMA due to the wild variability of DMA for the PC Card * devices. DMA was deleted from the PC Card specification in version * 7.2 of the standard, but that post-dates the YE-DATA devices by many * years. 
* * In addition, if we cannot setup the DMA resources for the ISA * attachment, we'll use this same offset for data transfer. However, * that almost certainly won't work. * * For this mode, offset 0 and 1 must be used to setup the transfer * for this floppy. This is OK for PC Card YE Data devices, but for * ISA this is likely wrong. These registers are only available on * those systems that map them to the floppy drive. Newer systems do * not do this, and we should likely prohibit access to them (or * disallow NODMA to be set). */ #define FDBCDR 0 /* And 1 */ #define FD_YE_DATAPORT 6 /* Drive Data port */ #define FDI_DCHG 0x80 /* diskette has been changed */ /* requires drive and motor being selected */ /* is cleared by any step pulse to drive */ /* * We have three private BIO commands. */ #define BIO_PROBE BIO_CMD0 #define BIO_RDID BIO_CMD1 #define BIO_FMT BIO_CMD2 /* * Per drive structure (softc). */ struct fd_data { u_char *fd_ioptr; /* IO pointer */ u_int fd_iosize; /* Size of IO chunks */ u_int fd_iocount; /* Outstanding requests */ struct fdc_data *fdc; /* pointer to controller structure */ int fdsu; /* this units number on this controller */ enum fd_drivetype type; /* drive type */ struct fd_type *ft; /* pointer to current type descriptor */ struct fd_type fts; /* type descriptors */ int sectorsize; int flags; #define FD_WP (1<<0) /* Write protected */ #define FD_MOTOR (1<<1) /* motor should be on */ #define FD_MOTORWAIT (1<<2) /* motor should be on */ #define FD_EMPTY (1<<3) /* no media */ #define FD_NEWDISK (1<<4) /* media changed */ #define FD_ISADMA (1<<5) /* isa dma started */ int track; /* where we think the head is */ #define FD_NO_TRACK -2 int options; /* FDOPT_* */ struct callout toffhandle; struct g_geom *fd_geom; struct g_provider *fd_provider; device_t dev; struct bio_queue_head fd_bq; }; #define FD_NOT_VALID -2 static driver_intr_t fdc_intr; static driver_filter_t fdc_intr_fast; static void fdc_reset(struct fdc_data *); static int fd_probe_disk(struct fd_data *, int *); static SYSCTL_NODE(_debug, OID_AUTO, fdc, CTLFLAG_RW, 0, "fdc driver"); static int fifo_threshold = 8; SYSCTL_INT(_debug_fdc, OID_AUTO, fifo, CTLFLAG_RW, &fifo_threshold, 0, "FIFO threshold setting"); static int debugflags = 0; SYSCTL_INT(_debug_fdc, OID_AUTO, debugflags, CTLFLAG_RW, &debugflags, 0, "Debug flags"); static int retries = 10; SYSCTL_INT(_debug_fdc, OID_AUTO, retries, CTLFLAG_RW, &retries, 0, "Number of retries to attempt"); static int spec1 = NE7_SPEC_1(6, 240); SYSCTL_INT(_debug_fdc, OID_AUTO, spec1, CTLFLAG_RW, &spec1, 0, "Specification byte one (step-rate + head unload)"); static int spec2 = NE7_SPEC_2(16, 0); SYSCTL_INT(_debug_fdc, OID_AUTO, spec2, CTLFLAG_RW, &spec2, 0, "Specification byte two (head load time + no-dma)"); static int settle; SYSCTL_INT(_debug_fdc, OID_AUTO, settle, CTLFLAG_RW, &settle, 0, "Head settling time in sec/hz"); static void fdprinttype(struct fd_type *ft) { printf("(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,0x%x)", ft->sectrac, ft->secsize, ft->datalen, ft->gap, ft->tracks, ft->size, ft->trans, ft->heads, ft->f_gap, ft->f_inter, ft->offset_side2, ft->flags); } static void fdsettype(struct fd_data *fd, struct fd_type *ft) { fd->ft = ft; ft->size = ft->sectrac * ft->heads * ft->tracks; fd->sectorsize = 128 << fd->ft->secsize; } /* * Bus space handling (access to low-level IO). 
*/ static inline void fdregwr(struct fdc_data *fdc, int reg, uint8_t v) { bus_space_write_1(fdc->iot, fdc->ioh[reg], fdc->ioff[reg], v); } static inline uint8_t fdregrd(struct fdc_data *fdc, int reg) { return bus_space_read_1(fdc->iot, fdc->ioh[reg], fdc->ioff[reg]); } static void fdctl_wr(struct fdc_data *fdc, u_int8_t v) { fdregwr(fdc, FDCTL, v); } static void fdout_wr(struct fdc_data *fdc, u_int8_t v) { fdregwr(fdc, FDOUT, v); } static u_int8_t fdsts_rd(struct fdc_data *fdc) { return fdregrd(fdc, FDSTS); } static void fddsr_wr(struct fdc_data *fdc, u_int8_t v) { fdregwr(fdc, FDDSR, v); } static void fddata_wr(struct fdc_data *fdc, u_int8_t v) { fdregwr(fdc, FDDATA, v); } static u_int8_t fddata_rd(struct fdc_data *fdc) { return fdregrd(fdc, FDDATA); } static u_int8_t fdin_rd(struct fdc_data *fdc) { return fdregrd(fdc, FDCTL); } /* * Magic pseudo-DMA initialization for YE FDC. Sets count and * direction. */ static void fdbcdr_wr(struct fdc_data *fdc, int iswrite, uint16_t count) { fdregwr(fdc, FDBCDR, (count - 1) & 0xff); fdregwr(fdc, FDBCDR + 1, (iswrite ? 0x80 : 0) | (((count - 1) >> 8) & 0x7f)); } static int fdc_err(struct fdc_data *fdc, const char *s) { fdc->fdc_errs++; if (s) { if (fdc->fdc_errs < FDC_ERRMAX) device_printf(fdc->fdc_dev, "%s", s); else if (fdc->fdc_errs == FDC_ERRMAX) device_printf(fdc->fdc_dev, "too many errors, not " "logging any more\n"); } return (1); } /* * FDC IO functions, take care of the main status register, timeout * in case the desired status bits are never set. * * These PIO loops initially start out with short delays between * each iteration in the expectation that the required condition * is usually met quickly, so it can be handled immediately. */ static int fdc_in(struct fdc_data *fdc, int *ptr) { int i, j, step; step = 1; for (j = 0; j < FDSTS_TIMEOUT; j += step) { i = fdsts_rd(fdc) & (NE7_DIO | NE7_RQM); if (i == (NE7_DIO|NE7_RQM)) { i = fddata_rd(fdc); if (ptr) *ptr = i; return (0); } if (i == NE7_RQM) return (fdc_err(fdc, "ready for output in input\n")); step += step; DELAY(step); } return (fdc_err(fdc, bootverbose? "input ready timeout\n": 0)); } static int fdc_out(struct fdc_data *fdc, int x) { int i, j, step; step = 1; for (j = 0; j < FDSTS_TIMEOUT; j += step) { i = fdsts_rd(fdc) & (NE7_DIO | NE7_RQM); if (i == NE7_RQM) { fddata_wr(fdc, x); return (0); } if (i == (NE7_DIO|NE7_RQM)) return (fdc_err(fdc, "ready for input in output\n")); step += step; DELAY(step); } return (fdc_err(fdc, bootverbose? "output ready timeout\n": 0)); } /* * fdc_cmd: Send a command to the chip. * Takes a varargs with this structure: * # of output bytes * output bytes as int [...] * # of input bytes * input bytes as int* [...] */ static int fdc_cmd(struct fdc_data *fdc, int n_out, ...) 
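/*
 * For example (matching calls made elsewhere in this driver), a SENSE
 * DRIVE STATUS exchange writes two command bytes and reads back one
 * status byte:
 *
 *	fdc_cmd(fdc, 2, NE7CMD_SENSED, fdc->fd->fdsu, 1, &st3);
 *
 * while SENSE INTERRUPT STATUS writes one byte and reads one back:
 *
 *	fdc_cmd(fdc, 1, NE7CMD_SENSEI, 1, &st0);
 */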
{ u_char cmd = 0; int n_in; int n, i; va_list ap; va_start(ap, n_out); for (n = 0; n < n_out; n++) { i = va_arg(ap, int); if (n == 0) cmd = i; if (fdc_out(fdc, i) < 0) { char msg[50]; snprintf(msg, sizeof(msg), "cmd %x failed at out byte %d of %d\n", cmd, n + 1, n_out); fdc->flags |= FDC_NEEDS_RESET; va_end(ap); return fdc_err(fdc, msg); } } n_in = va_arg(ap, int); for (n = 0; n < n_in; n++) { int *ptr = va_arg(ap, int *); if (fdc_in(fdc, ptr) < 0) { char msg[50]; snprintf(msg, sizeof(msg), "cmd %02x failed at in byte %d of %d\n", cmd, n + 1, n_in); fdc->flags |= FDC_NEEDS_RESET; va_end(ap); return fdc_err(fdc, msg); } } va_end(ap); return (0); } static void fdc_reset(struct fdc_data *fdc) { int i, r[10]; if (fdc->fdct == FDC_ENHANCED) { /* Try a software reset, default precomp, and 500 kb/s */ fddsr_wr(fdc, I8207X_DSR_SR); } else { /* Try a hardware reset, keep motor on */ fdout_wr(fdc, fdc->fdout & ~(FDO_FRST|FDO_FDMAEN)); DELAY(100); /* enable FDC, but defer interrupts a moment */ fdout_wr(fdc, fdc->fdout & ~FDO_FDMAEN); } DELAY(100); fdout_wr(fdc, fdc->fdout); /* XXX after a reset, silently believe the FDC will accept commands */ if (fdc_cmd(fdc, 3, NE7CMD_SPECIFY, spec1, spec2, 0)) device_printf(fdc->fdc_dev, " SPECIFY failed in reset\n"); if (fdc->fdct == FDC_ENHANCED) { if (fdc_cmd(fdc, 4, I8207X_CONFIG, 0, /* 0x40 | */ /* Enable Implied Seek - * breaks 2step! */ 0x10 | /* Polling disabled */ (fifo_threshold - 1), /* Fifo threshold */ 0x00, /* Precomp track */ 0)) device_printf(fdc->fdc_dev, " CONFIGURE failed in reset\n"); if (debugflags & 1) { if (fdc_cmd(fdc, 1, I8207X_DUMPREG, 10, &r[0], &r[1], &r[2], &r[3], &r[4], &r[5], &r[6], &r[7], &r[8], &r[9])) device_printf(fdc->fdc_dev, " DUMPREG failed in reset\n"); for (i = 0; i < 10; i++) printf(" %02x", r[i]); printf("\n"); } } } static int fdc_sense_drive(struct fdc_data *fdc, int *st3p) { int st3; if (fdc_cmd(fdc, 2, NE7CMD_SENSED, fdc->fd->fdsu, 1, &st3)) return (fdc_err(fdc, "Sense Drive Status failed\n")); if (st3p) *st3p = st3; return (0); } static int fdc_sense_int(struct fdc_data *fdc, int *st0p, int *cylp) { int cyl, st0, ret; ret = fdc_cmd(fdc, 1, NE7CMD_SENSEI, 1, &st0); if (ret) { (void)fdc_err(fdc, "sense intr err reading stat reg 0\n"); return (ret); } if (st0p) *st0p = st0; if ((st0 & NE7_ST0_IC) == NE7_ST0_IC_IV) { /* * There doesn't seem to have been an interrupt. 
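 * (SENSE INTERRUPT STATUS reports the invalid-command code in ST0 when
 * no interrupt is pending, hence the FD_NOT_VALID return below.)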
*/ return (FD_NOT_VALID); } if (fdc_in(fdc, &cyl) < 0) return fdc_err(fdc, "can't get cyl num\n"); if (cylp) *cylp = cyl; return (0); } static int fdc_read_status(struct fdc_data *fdc) { int i, ret, status; for (i = ret = 0; i < 7; i++) { ret = fdc_in(fdc, &status); fdc->status[i] = status; if (ret != 0) break; } if (ret == 0) fdc->flags |= FDC_STAT_VALID; else fdc->flags &= ~FDC_STAT_VALID; return ret; } /* * Select this drive */ static void fd_select(struct fd_data *fd) { struct fdc_data *fdc; /* XXX: lock controller */ fdc = fd->fdc; fdc->fdout &= ~FDO_FDSEL; fdc->fdout |= FDO_FDMAEN | FDO_FRST | fd->fdsu; fdout_wr(fdc, fdc->fdout); } static void fd_turnon(void *arg) { struct fd_data *fd; struct bio *bp; int once; fd = arg; mtx_assert(&fd->fdc->fdc_mtx, MA_OWNED); fd->flags &= ~FD_MOTORWAIT; fd->flags |= FD_MOTOR; once = 0; for (;;) { bp = bioq_takefirst(&fd->fd_bq); if (bp == NULL) break; bioq_disksort(&fd->fdc->head, bp); once = 1; } if (once) wakeup(&fd->fdc->head); } static void fd_motor(struct fd_data *fd, int turnon) { struct fdc_data *fdc; fdc = fd->fdc; /* mtx_assert(&fdc->fdc_mtx, MA_OWNED); */ if (turnon) { fd->flags |= FD_MOTORWAIT; fdc->fdout |= (FDO_MOEN0 << fd->fdsu); callout_reset(&fd->toffhandle, hz, fd_turnon, fd); } else { callout_stop(&fd->toffhandle); fd->flags &= ~(FD_MOTOR|FD_MOTORWAIT); fdc->fdout &= ~(FDO_MOEN0 << fd->fdsu); } fdout_wr(fdc, fdc->fdout); } static void fd_turnoff(void *xfd) { struct fd_data *fd = xfd; mtx_assert(&fd->fdc->fdc_mtx, MA_OWNED); fd_motor(fd, 0); } /* * fdc_intr - wake up the worker thread. */ static void fdc_intr(void *arg) { wakeup(arg); } static int fdc_intr_fast(void *arg) { wakeup(arg); return(FILTER_HANDLED); } /* * fdc_pio(): perform programmed IO read/write for YE PCMCIA floppy. */ static void fdc_pio(struct fdc_data *fdc) { u_char *cptr; struct bio *bp; u_int count; bp = fdc->bp; cptr = fdc->fd->fd_ioptr; count = fdc->fd->fd_iosize; if (bp->bio_cmd == BIO_READ) { fdbcdr_wr(fdc, 0, count); bus_space_read_multi_1(fdc->iot, fdc->ioh[FD_YE_DATAPORT], fdc->ioff[FD_YE_DATAPORT], cptr, count); } else { bus_space_write_multi_1(fdc->iot, fdc->ioh[FD_YE_DATAPORT], fdc->ioff[FD_YE_DATAPORT], cptr, count); fdbcdr_wr(fdc, 0, count); /* needed? */ } } static int fdc_biodone(struct fdc_data *fdc, int error) { struct fd_data *fd; struct bio *bp; fd = fdc->fd; bp = fdc->bp; mtx_lock(&fdc->fdc_mtx); if (--fd->fd_iocount == 0) callout_reset(&fd->toffhandle, 4 * hz, fd_turnoff, fd); fdc->bp = NULL; fdc->fd = NULL; mtx_unlock(&fdc->fdc_mtx); if (bp->bio_to != NULL) { if ((debugflags & 2) && fd->fdc->retry > 0) printf("retries: %d\n", fd->fdc->retry); g_io_deliver(bp, error); return (0); } bp->bio_error = error; bp->bio_flags |= BIO_DONE; wakeup(bp); return (0); } static int retry_line; static int fdc_worker(struct fdc_data *fdc) { struct fd_data *fd; struct bio *bp; int i, nsect; int st0, st3, cyl, mfm, steptrac, cylinder, descyl, sec; int head; int override_error; static int need_recal; struct fdc_readid *idp; struct fd_formb *finfo; override_error = 0; /* Have we exhausted our retries ? */ bp = fdc->bp; fd = fdc->fd; if (bp != NULL && (fdc->retry >= retries || (fd->options & FDOPT_NORETRY))) { if ((debugflags & 4)) printf("Too many retries (EIO)\n"); if (fdc->flags & FDC_NEEDS_RESET) { mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_EMPTY; mtx_unlock(&fdc->fdc_mtx); } return (fdc_biodone(fdc, EIO)); } /* Disable ISADMA if we bailed while it was active */ if (fd != NULL && (fd->flags & FD_ISADMA)) { isa_dmadone( bp->bio_cmd == BIO_READ ? 
ISADMA_READ : ISADMA_WRITE, fd->fd_ioptr, fd->fd_iosize, fdc->dmachan); mtx_lock(&fdc->fdc_mtx); fd->flags &= ~FD_ISADMA; mtx_unlock(&fdc->fdc_mtx); } /* Unwedge the controller ? */ if (fdc->flags & FDC_NEEDS_RESET) { fdc->flags &= ~FDC_NEEDS_RESET; fdc_reset(fdc); if (cold) DELAY(1000000); else tsleep(fdc, PRIBIO, "fdcrst", hz); /* Discard results */ for (i = 0; i < 4; i++) fdc_sense_int(fdc, &st0, &cyl); /* All drives must recal */ need_recal = 0xf; } /* Pick up a request, if need be wait for it */ if (fdc->bp == NULL) { mtx_lock(&fdc->fdc_mtx); do { fdc->bp = bioq_takefirst(&fdc->head); if (fdc->bp == NULL) msleep(&fdc->head, &fdc->fdc_mtx, PRIBIO, "-", 0); } while (fdc->bp == NULL && (fdc->flags & FDC_KTHREAD_EXIT) == 0); mtx_unlock(&fdc->fdc_mtx); if (fdc->bp == NULL) /* * Nothing to do, worker thread has been * requested to stop. */ return (0); bp = fdc->bp; fd = fdc->fd = bp->bio_driver1; fdc->retry = 0; fd->fd_ioptr = bp->bio_data; if (bp->bio_cmd == BIO_FMT) { i = offsetof(struct fd_formb, fd_formb_cylno(0)); fd->fd_ioptr += i; fd->fd_iosize = bp->bio_length - i; } } /* Select drive, setup params */ fd_select(fd); if (fdc->fdct == FDC_ENHANCED) fddsr_wr(fdc, fd->ft->trans); else fdctl_wr(fdc, fd->ft->trans); if (bp->bio_cmd == BIO_PROBE) { if ((!(device_get_flags(fd->dev) & FD_NO_CHLINE) && !(fdin_rd(fdc) & FDI_DCHG) && !(fd->flags & FD_EMPTY)) || fd_probe_disk(fd, &need_recal) == 0) return (fdc_biodone(fdc, 0)); return (1); } /* * If we are dead just flush the requests */ if (fd->flags & FD_EMPTY) return (fdc_biodone(fdc, ENXIO)); /* Check if we lost our media */ if (fdin_rd(fdc) & FDI_DCHG) { if (debugflags & 0x40) printf("Lost disk\n"); mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_EMPTY; fd->flags |= FD_NEWDISK; mtx_unlock(&fdc->fdc_mtx); g_topology_lock(); g_orphan_provider(fd->fd_provider, ENXIO); fd->fd_provider->flags |= G_PF_WITHER; fd->fd_provider = g_new_providerf(fd->fd_geom, "%s", fd->fd_geom->name); g_error_provider(fd->fd_provider, 0); g_topology_unlock(); return (fdc_biodone(fdc, ENXIO)); } /* Check if the floppy is write-protected */ if (bp->bio_cmd == BIO_FMT || bp->bio_cmd == BIO_WRITE) { retry_line = __LINE__; if(fdc_sense_drive(fdc, &st3) != 0) return (1); if(st3 & NE7_ST3_WP) return (fdc_biodone(fdc, EROFS)); } mfm = (fd->ft->flags & FL_MFM)? NE7CMD_MFM: 0; steptrac = (fd->ft->flags & FL_2STEP)? 
2: 1; i = fd->ft->sectrac * fd->ft->heads; cylinder = bp->bio_pblkno / i; descyl = cylinder * steptrac; sec = bp->bio_pblkno % i; nsect = i - sec; head = sec / fd->ft->sectrac; sec = sec % fd->ft->sectrac + 1; /* If everything is going swimmingly, use multisector xfer */ if (fdc->retry == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { fd->fd_iosize = imin(nsect * fd->sectorsize, bp->bio_resid); nsect = fd->fd_iosize / fd->sectorsize; } else if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { fd->fd_iosize = fd->sectorsize; nsect = 1; } /* Do RECAL if we need to or are going to track zero anyway */ if ((need_recal & (1 << fd->fdsu)) || (cylinder == 0 && fd->track != 0) || fdc->retry > 2) { retry_line = __LINE__; if (fdc_cmd(fdc, 2, NE7CMD_RECAL, fd->fdsu, 0)) return (1); tsleep(fdc, PRIBIO, "fdrecal", hz); retry_line = __LINE__; if (fdc_sense_int(fdc, &st0, &cyl) == FD_NOT_VALID) return (1); /* XXX */ retry_line = __LINE__; if ((st0 & 0xc0) || cyl != 0) return (1); need_recal &= ~(1 << fd->fdsu); fd->track = 0; /* let the heads settle */ if (settle) tsleep(fdc->fd, PRIBIO, "fdhdstl", settle); } /* * SEEK to where we want to be */ if (cylinder != fd->track) { retry_line = __LINE__; if (fdc_cmd(fdc, 3, NE7CMD_SEEK, fd->fdsu, descyl, 0)) return (1); tsleep(fdc, PRIBIO, "fdseek", hz); retry_line = __LINE__; if (fdc_sense_int(fdc, &st0, &cyl) == FD_NOT_VALID) return (1); /* XXX */ retry_line = __LINE__; if ((st0 & 0xc0) || cyl != descyl) { need_recal |= (1 << fd->fdsu); return (1); } /* let the heads settle */ if (settle) tsleep(fdc->fd, PRIBIO, "fdhdstl", settle); } fd->track = cylinder; if (debugflags & 8) printf("op %x bn %ju siz %u ptr %p retry %d\n", bp->bio_cmd, bp->bio_pblkno, fd->fd_iosize, fd->fd_ioptr, fdc->retry); /* Setup ISADMA if we need it and have it */ if ((bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FMT) && !(fdc->flags & FDC_NODMA)) { isa_dmastart( bp->bio_cmd == BIO_READ ? 
ISADMA_READ : ISADMA_WRITE, fd->fd_ioptr, fd->fd_iosize, fdc->dmachan); mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_ISADMA; mtx_unlock(&fdc->fdc_mtx); } /* Do PIO if we have to */ if (fdc->flags & FDC_NODMA) { if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FMT) fdbcdr_wr(fdc, 1, fd->fd_iosize); if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FMT) fdc_pio(fdc); } switch(bp->bio_cmd) { case BIO_FMT: /* formatting */ finfo = (struct fd_formb *)bp->bio_data; retry_line = __LINE__; if (fdc_cmd(fdc, 6, NE7CMD_FORMAT | mfm, head << 2 | fd->fdsu, finfo->fd_formb_secshift, finfo->fd_formb_nsecs, finfo->fd_formb_gaplen, finfo->fd_formb_fillbyte, 0)) return (1); break; case BIO_RDID: retry_line = __LINE__; if (fdc_cmd(fdc, 2, NE7CMD_READID | mfm, head << 2 | fd->fdsu, 0)) return (1); break; case BIO_READ: retry_line = __LINE__; if (fdc_cmd(fdc, 9, NE7CMD_READ | NE7CMD_SK | mfm | NE7CMD_MT, head << 2 | fd->fdsu, /* head & unit */ fd->track, /* track */ head, /* head */ sec, /* sector + 1 */ fd->ft->secsize, /* sector size */ fd->ft->sectrac, /* sectors/track */ fd->ft->gap, /* gap size */ fd->ft->datalen, /* data length */ 0)) return (1); break; case BIO_WRITE: retry_line = __LINE__; if (fdc_cmd(fdc, 9, NE7CMD_WRITE | mfm | NE7CMD_MT, head << 2 | fd->fdsu, /* head & unit */ fd->track, /* track */ head, /* head */ sec, /* sector + 1 */ fd->ft->secsize, /* sector size */ fd->ft->sectrac, /* sectors/track */ fd->ft->gap, /* gap size */ fd->ft->datalen, /* data length */ 0)) return (1); break; default: KASSERT(0 == 1, ("Wrong bio_cmd %x\n", bp->bio_cmd)); } /* Wait for interrupt */ i = tsleep(fdc, PRIBIO, "fddata", hz); /* PIO if the read looks good */ if (i == 0 && (fdc->flags & FDC_NODMA) && (bp->bio_cmd == BIO_READ)) fdc_pio(fdc); /* Finish DMA */ if (fd->flags & FD_ISADMA) { isa_dmadone( bp->bio_cmd == BIO_READ ? ISADMA_READ : ISADMA_WRITE, fd->fd_ioptr, fd->fd_iosize, fdc->dmachan); mtx_lock(&fdc->fdc_mtx); fd->flags &= ~FD_ISADMA; mtx_unlock(&fdc->fdc_mtx); } if (i != 0) { /* * Timeout. * * Due to IBM's brain-dead design, the FDC has a faked ready * signal, hardwired to ready == true. Thus, any command * issued if there's no diskette in the drive will _never_ * complete, and must be aborted by resetting the FDC. * Many thanks, Big Blue! */ retry_line = __LINE__; fdc->flags |= FDC_NEEDS_RESET; return (1); } retry_line = __LINE__; if (fdc_read_status(fdc)) return (1); if (debugflags & 0x10) printf(" -> %x %x %x %x\n", fdc->status[0], fdc->status[1], fdc->status[2], fdc->status[3]); st0 = fdc->status[0] & NE7_ST0_IC; if (st0 != 0) { retry_line = __LINE__; if (st0 == NE7_ST0_IC_AT && fdc->status[1] & NE7_ST1_OR) { /* * DMA overrun. Someone hogged the bus and * didn't release it in time for the next * FDC transfer. 
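 * Returning 1 lets the worker thread count a retry and reissue
 * the transfer.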
*/ return (1); } retry_line = __LINE__; if(st0 == NE7_ST0_IC_IV) { fdc->flags |= FDC_NEEDS_RESET; return (1); } retry_line = __LINE__; if(st0 == NE7_ST0_IC_AT && fdc->status[2] & NE7_ST2_WC) { need_recal |= (1 << fd->fdsu); return (1); } if (debugflags & 0x20) { printf("status %02x %02x %02x %02x %02x %02x\n", fdc->status[0], fdc->status[1], fdc->status[2], fdc->status[3], fdc->status[4], fdc->status[5]); } retry_line = __LINE__; if (fd->options & FDOPT_NOERROR) override_error = 1; else return (1); } /* All OK */ switch(bp->bio_cmd) { case BIO_RDID: /* copy out ID field contents */ idp = (struct fdc_readid *)bp->bio_data; idp->cyl = fdc->status[3]; idp->head = fdc->status[4]; idp->sec = fdc->status[5]; idp->secshift = fdc->status[6]; if (debugflags & 0x40) printf("c %d h %d s %d z %d\n", idp->cyl, idp->head, idp->sec, idp->secshift); break; case BIO_READ: case BIO_WRITE: bp->bio_pblkno += nsect; bp->bio_resid -= fd->fd_iosize; bp->bio_completed += fd->fd_iosize; fd->fd_ioptr += fd->fd_iosize; if (override_error) { if ((debugflags & 4)) printf("FDOPT_NOERROR: returning bad data\n"); } else { /* Since we managed to get something done, * reset the retry */ fdc->retry = 0; if (bp->bio_resid > 0) return (0); } break; case BIO_FMT: break; } return (fdc_biodone(fdc, 0)); } static void fdc_thread(void *arg) { struct fdc_data *fdc; fdc = arg; int i; mtx_lock(&fdc->fdc_mtx); fdc->flags |= FDC_KTHREAD_ALIVE; while ((fdc->flags & FDC_KTHREAD_EXIT) == 0) { mtx_unlock(&fdc->fdc_mtx); i = fdc_worker(fdc); if (i && debugflags & 0x20) { - if (fdc->bp != NULL) { - g_print_bio(fdc->bp); - printf("\n"); - } + if (fdc->bp != NULL) + g_print_bio("", fdc->bp, ""); printf("Retry line %d\n", retry_line); } fdc->retry += i; mtx_lock(&fdc->fdc_mtx); } fdc->flags &= ~(FDC_KTHREAD_EXIT | FDC_KTHREAD_ALIVE); mtx_unlock(&fdc->fdc_mtx); kproc_exit(0); } /* * Enqueue a request. */ static void fd_enqueue(struct fd_data *fd, struct bio *bp) { struct fdc_data *fdc; int call; call = 0; fdc = fd->fdc; mtx_lock(&fdc->fdc_mtx); /* If we go from idle, cancel motor turnoff */ if (fd->fd_iocount++ == 0) callout_stop(&fd->toffhandle); if (fd->flags & FD_MOTOR) { /* The motor is on, send it directly to the controller */ bioq_disksort(&fdc->head, bp); wakeup(&fdc->head); } else { /* Queue it on the drive until the motor has started */ bioq_insert_tail(&fd->fd_bq, bp); if (!(fd->flags & FD_MOTORWAIT)) fd_motor(fd, 1); } mtx_unlock(&fdc->fdc_mtx); } /* * Try to find out if we have a disk in the drive. */ static int fd_probe_disk(struct fd_data *fd, int *recal) { struct fdc_data *fdc; int st0, st3, cyl; int oopts, ret; fdc = fd->fdc; oopts = fd->options; fd->options |= FDOPT_NOERRLOG | FDOPT_NORETRY; ret = 1; /* * First recal, then seek to cyl#1, this clears the old condition on * the disk change line so we can examine it for current status. 
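 * (The disk change line stays latched until the drive sees a step
 * pulse; see the FDI_DCHG definition above.)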
*/ if (debugflags & 0x40) printf("New disk in probe\n"); mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_NEWDISK; mtx_unlock(&fdc->fdc_mtx); if (fdc_cmd(fdc, 2, NE7CMD_RECAL, fd->fdsu, 0)) goto done; tsleep(fdc, PRIBIO, "fdrecal", hz); if (fdc_sense_int(fdc, &st0, &cyl) == FD_NOT_VALID) goto done; /* XXX */ if ((st0 & 0xc0) || cyl != 0) goto done; /* Seek to track 1 */ if (fdc_cmd(fdc, 3, NE7CMD_SEEK, fd->fdsu, 1, 0)) goto done; tsleep(fdc, PRIBIO, "fdseek", hz); if (fdc_sense_int(fdc, &st0, &cyl) == FD_NOT_VALID) goto done; /* XXX */ *recal |= (1 << fd->fdsu); if (fdin_rd(fdc) & FDI_DCHG) { if (debugflags & 0x40) printf("Empty in probe\n"); mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_EMPTY; mtx_unlock(&fdc->fdc_mtx); } else { if (fdc_sense_drive(fdc, &st3) != 0) goto done; if (debugflags & 0x40) printf("Got disk in probe\n"); mtx_lock(&fdc->fdc_mtx); fd->flags &= ~FD_EMPTY; if (st3 & NE7_ST3_WP) fd->flags |= FD_WP; else fd->flags &= ~FD_WP; mtx_unlock(&fdc->fdc_mtx); } ret = 0; done: fd->options = oopts; return (ret); } static int fdmisccmd(struct fd_data *fd, u_int cmd, void *data) { struct bio *bp; struct fd_formb *finfo; struct fdc_readid *idfield; int error; bp = malloc(sizeof(struct bio), M_TEMP, M_WAITOK | M_ZERO); /* * Set up a bio request for fdstrategy(). bio_offset is faked * so that fdstrategy() will seek to the requested * cylinder, and use the desired head. */ bp->bio_cmd = cmd; if (cmd == BIO_FMT) { finfo = (struct fd_formb *)data; bp->bio_pblkno = (finfo->cyl * fd->ft->heads + finfo->head) * fd->ft->sectrac; bp->bio_length = sizeof *finfo; } else if (cmd == BIO_RDID) { idfield = (struct fdc_readid *)data; bp->bio_pblkno = (idfield->cyl * fd->ft->heads + idfield->head) * fd->ft->sectrac; bp->bio_length = sizeof(struct fdc_readid); } else if (cmd == BIO_PROBE) { /* nothing */ } else panic("wrong cmd in fdmisccmd()"); bp->bio_offset = bp->bio_pblkno * fd->sectorsize; bp->bio_data = data; bp->bio_driver1 = fd; bp->bio_flags = 0; fd_enqueue(fd, bp); do { tsleep(bp, PRIBIO, "fdwait", hz); } while (!(bp->bio_flags & BIO_DONE)); error = bp->bio_error; free(bp, M_TEMP); return (error); } /* * Try figuring out the density of the media present in our device. */ static int fdautoselect(struct fd_data *fd) { struct fd_type *fdtp; struct fdc_readid id; int oopts, rv; if (!(fd->ft->flags & FL_AUTO)) return (0); fdtp = fd_native_types[fd->type]; fdsettype(fd, fdtp); if (!(fd->ft->flags & FL_AUTO)) return (0); /* * Try reading sector ID fields, first at cylinder 0, head 0, * then at cylinder 2, head N. We don't probe cylinder 1, * since for 5.25in DD media in a HD drive, there are no data * to read (2 step pulses per media cylinder required). For * two-sided media, the second probe always goes to head 1, so * we can tell them apart from single-sided media. As a * side-effect this means that single-sided media should be * mentioned in the search list after two-sided media of an * otherwise identical density. Media with a different number * of sectors per track but otherwise identical parameters * cannot be distinguished at all. * * If we successfully read an ID field on both cylinders where * the recorded values match our expectation, we are done. * Otherwise, we try the next density entry from the table. * * Stepping to cylinder 2 has the side-effect of clearing the * unit attention bit. 
*/ oopts = fd->options; fd->options |= FDOPT_NOERRLOG | FDOPT_NORETRY; for (; fdtp->heads; fdtp++) { fdsettype(fd, fdtp); id.cyl = id.head = 0; rv = fdmisccmd(fd, BIO_RDID, &id); if (rv != 0) continue; if (id.cyl != 0 || id.head != 0 || id.secshift != fdtp->secsize) continue; id.cyl = 2; id.head = fd->ft->heads - 1; rv = fdmisccmd(fd, BIO_RDID, &id); if (id.cyl != 2 || id.head != fdtp->heads - 1 || id.secshift != fdtp->secsize) continue; if (rv == 0) break; } fd->options = oopts; if (fdtp->heads == 0) { if (debugflags & 0x40) device_printf(fd->dev, "autoselection failed\n"); fdsettype(fd, fd_native_types[fd->type]); return (-1); } else { if (debugflags & 0x40) { device_printf(fd->dev, "autoselected %d KB medium\n", fd->ft->size / 2); fdprinttype(fd->ft); } return (0); } } /* * GEOM class implementation */ static g_access_t fd_access; static g_start_t fd_start; static g_ioctl_t fd_ioctl; struct g_class g_fd_class = { .name = "FD", .version = G_VERSION, .start = fd_start, .access = fd_access, .ioctl = fd_ioctl, }; static int fd_access(struct g_provider *pp, int r, int w, int e) { struct fd_data *fd; struct fdc_data *fdc; int ar, aw, ae; int busy; fd = pp->geom->softc; fdc = fd->fdc; /* * If our provider is withering, we can only get negative requests * and we don't want to even see them */ if (pp->flags & G_PF_WITHER) return (0); ar = r + pp->acr; aw = w + pp->acw; ae = e + pp->ace; if (ar == 0 && aw == 0 && ae == 0) { fd->options &= ~(FDOPT_NORETRY | FDOPT_NOERRLOG | FDOPT_NOERROR); device_unbusy(fd->dev); return (0); } busy = 0; if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) { if (fdmisccmd(fd, BIO_PROBE, NULL)) return (ENXIO); if (fd->flags & FD_EMPTY) return (ENXIO); if (fd->flags & FD_NEWDISK) { if (fdautoselect(fd) != 0 && (device_get_flags(fd->dev) & FD_NO_CHLINE)) { mtx_lock(&fdc->fdc_mtx); fd->flags |= FD_EMPTY; mtx_unlock(&fdc->fdc_mtx); return (ENXIO); } mtx_lock(&fdc->fdc_mtx); fd->flags &= ~FD_NEWDISK; mtx_unlock(&fdc->fdc_mtx); } device_busy(fd->dev); busy = 1; } if (w > 0 && (fd->flags & FD_WP)) { if (busy) device_unbusy(fd->dev); return (EROFS); } pp->sectorsize = fd->sectorsize; pp->stripesize = fd->ft->heads * fd->ft->sectrac * fd->sectorsize; pp->mediasize = pp->stripesize * fd->ft->tracks; return (0); } static void fd_start(struct bio *bp) { struct fdc_data * fdc; struct fd_data * fd; fd = bp->bio_to->geom->softc; fdc = fd->fdc; bp->bio_driver1 = fd; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, "GEOM::fwsectors", fd->ft->sectrac)) return; if (g_handleattr_int(bp, "GEOM::fwheads", fd->ft->heads)) return; g_io_deliver(bp, ENOIOCTL); return; } if (!(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { g_io_deliver(bp, EOPNOTSUPP); return; } bp->bio_pblkno = bp->bio_offset / fd->sectorsize; bp->bio_resid = bp->bio_length; fd_enqueue(fd, bp); return; } static int fd_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct fd_data *fd; struct fdc_status *fsp; struct fdc_readid *rid; int error; fd = pp->geom->softc; switch (cmd) { case FD_GTYPE: /* get drive type */ *(struct fd_type *)data = *fd->ft; return (0); case FD_STYPE: /* set drive type */ /* * Allow setting drive type temporarily iff * currently unset. Used for fdformat so any * user can set it, and then start formatting. 
*/ fd->fts = *(struct fd_type *)data; if (fd->fts.sectrac) { /* XXX: check for rubbish */ fdsettype(fd, &fd->fts); } else { fdsettype(fd, fd_native_types[fd->type]); } if (debugflags & 0x40) fdprinttype(fd->ft); return (0); case FD_GOPTS: /* get drive options */ *(int *)data = fd->options; return (0); case FD_SOPTS: /* set drive options */ fd->options = *(int *)data; return (0); case FD_CLRERR: error = priv_check(td, PRIV_DRIVER); if (error) return (error); fd->fdc->fdc_errs = 0; return (0); case FD_GSTAT: fsp = (struct fdc_status *)data; if ((fd->fdc->flags & FDC_STAT_VALID) == 0) return (EINVAL); memcpy(fsp->status, fd->fdc->status, 7 * sizeof(u_int)); return (0); case FD_GDTYPE: *(enum fd_drivetype *)data = fd->type; return (0); case FD_FORM: if (!(fflag & FWRITE)) return (EPERM); if (((struct fd_formb *)data)->format_version != FD_FORMAT_VERSION) return (EINVAL); /* wrong version of formatting prog */ error = fdmisccmd(fd, BIO_FMT, data); mtx_lock(&fd->fdc->fdc_mtx); fd->flags |= FD_NEWDISK; mtx_unlock(&fd->fdc->fdc_mtx); break; case FD_READID: rid = (struct fdc_readid *)data; if (rid->cyl > 85 || rid->head > 1) return (EINVAL); error = fdmisccmd(fd, BIO_RDID, data); break; case FIONBIO: case FIOASYNC: /* For backwards compat with old fd*(8) tools */ error = 0; break; default: if (debugflags & 0x80) printf("Unknown ioctl %lx\n", cmd); error = ENOIOCTL; break; } return (error); }; /* * Configuration/initialization stuff, per controller. */ devclass_t fdc_devclass; static devclass_t fd_devclass; struct fdc_ivars { int fdunit; int fdtype; }; void fdc_release_resources(struct fdc_data *fdc) { device_t dev; struct resource *last; int i; dev = fdc->fdc_dev; if (fdc->fdc_intr) bus_teardown_intr(dev, fdc->res_irq, fdc->fdc_intr); fdc->fdc_intr = NULL; if (fdc->res_irq != NULL) bus_release_resource(dev, SYS_RES_IRQ, fdc->rid_irq, fdc->res_irq); fdc->res_irq = NULL; last = NULL; for (i = 0; i < FDC_MAXREG; i++) { if (fdc->resio[i] != NULL && fdc->resio[i] != last) { bus_release_resource(dev, SYS_RES_IOPORT, fdc->ridio[i], fdc->resio[i]); last = fdc->resio[i]; fdc->resio[i] = NULL; } } if (fdc->res_drq != NULL) bus_release_resource(dev, SYS_RES_DRQ, fdc->rid_drq, fdc->res_drq); fdc->res_drq = NULL; } int fdc_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) { struct fdc_ivars *ivars = device_get_ivars(child); switch (which) { case FDC_IVAR_FDUNIT: *result = ivars->fdunit; break; case FDC_IVAR_FDTYPE: *result = ivars->fdtype; break; default: return (ENOENT); } return (0); } int fdc_write_ivar(device_t dev, device_t child, int which, uintptr_t value) { struct fdc_ivars *ivars = device_get_ivars(child); switch (which) { case FDC_IVAR_FDUNIT: ivars->fdunit = value; break; case FDC_IVAR_FDTYPE: ivars->fdtype = value; break; default: return (ENOENT); } return (0); } int fdc_initial_reset(device_t dev, struct fdc_data *fdc) { int ic_type, part_id; /* * A status value of 0xff is very unlikely, but not theoretically * impossible, but it is far more likely to indicate an empty bus. */ if (fdsts_rd(fdc) == 0xff) return (ENXIO); /* * Assert a reset to the floppy controller and check that the status * register goes to zero. */ fdout_wr(fdc, 0); fdout_wr(fdc, 0); if (fdsts_rd(fdc) != 0) return (ENXIO); /* * Clear the reset and see it come ready. */ fdout_wr(fdc, FDO_FRST); DELAY(100); if (fdsts_rd(fdc) != 0x80) return (ENXIO); /* Then, see if it can handle a command. 
*/ if (fdc_cmd(fdc, 3, NE7CMD_SPECIFY, NE7_SPEC_1(6, 240), NE7_SPEC_2(31, 0), 0)) return (ENXIO); /* * Try to identify the chip. * * The i8272 datasheet documents that unknown commands * will return ST0 as 0x80. The i8272 is supposedly identical * to the NEC765. * The i82077SL datasheet says 0x90 for the VERSION command, * and several "superio" chips emulate this. */ if (fdc_cmd(fdc, 1, NE7CMD_VERSION, 1, &ic_type)) return (ENXIO); if (fdc_cmd(fdc, 1, 0x18, 1, &part_id)) return (ENXIO); if (bootverbose) device_printf(dev, "ic_type %02x part_id %02x\n", ic_type, part_id); switch (ic_type & 0xff) { case 0x80: device_set_desc(dev, "NEC 765 or clone"); fdc->fdct = FDC_NE765; break; case 0x81: case 0x90: device_set_desc(dev, "Enhanced floppy controller"); fdc->fdct = FDC_ENHANCED; break; default: device_set_desc(dev, "Generic floppy controller"); fdc->fdct = FDC_UNKNOWN; break; } return (0); } int fdc_detach(device_t dev) { struct fdc_data *fdc; int error; fdc = device_get_softc(dev); /* have our children detached first */ if ((error = bus_generic_detach(dev))) return (error); if (fdc->fdc_intr) bus_teardown_intr(dev, fdc->res_irq, fdc->fdc_intr); fdc->fdc_intr = NULL; /* kill worker thread */ mtx_lock(&fdc->fdc_mtx); fdc->flags |= FDC_KTHREAD_EXIT; wakeup(&fdc->head); while ((fdc->flags & FDC_KTHREAD_ALIVE) != 0) msleep(fdc->fdc_thread, &fdc->fdc_mtx, PRIBIO, "fdcdet", 0); mtx_unlock(&fdc->fdc_mtx); /* reset controller, turn motor off */ fdout_wr(fdc, 0); if (!(fdc->flags & FDC_NODMA)) isa_dma_release(fdc->dmachan); fdc_release_resources(fdc); mtx_destroy(&fdc->fdc_mtx); return (0); } /* * Add a child device to the fdc controller. It will then be probed etc. */ device_t fdc_add_child(device_t dev, const char *name, int unit) { struct fdc_ivars *ivar; device_t child; ivar = malloc(sizeof *ivar, M_DEVBUF /* XXX */, M_NOWAIT | M_ZERO); if (ivar == NULL) return (NULL); child = device_add_child(dev, name, unit); if (child == NULL) { free(ivar, M_DEVBUF); return (NULL); } device_set_ivars(child, ivar); ivar->fdunit = unit; ivar->fdtype = FDT_NONE; if (resource_disabled(name, unit)) device_disable(child); return (child); } int fdc_attach(device_t dev) { struct fdc_data *fdc; int error; fdc = device_get_softc(dev); fdc->fdc_dev = dev; error = fdc_initial_reset(dev, fdc); if (error) { device_printf(dev, "does not respond\n"); return (error); } error = bus_setup_intr(dev, fdc->res_irq, INTR_TYPE_BIO | INTR_ENTROPY | ((fdc->flags & FDC_NOFAST) ? INTR_MPSAFE : 0), ((fdc->flags & FDC_NOFAST) ? NULL : fdc_intr_fast), ((fdc->flags & FDC_NOFAST) ? fdc_intr : NULL), fdc, &fdc->fdc_intr); if (error) { device_printf(dev, "cannot setup interrupt\n"); return (error); } if (!(fdc->flags & FDC_NODMA)) { error = isa_dma_acquire(fdc->dmachan); if (!error) { error = isa_dma_init(fdc->dmachan, MAX_BYTES_PER_CYL, M_WAITOK); if (error) isa_dma_release(fdc->dmachan); } if (error) return (error); } fdc->fdcu = device_get_unit(dev); fdc->flags |= FDC_NEEDS_RESET; mtx_init(&fdc->fdc_mtx, "fdc lock", NULL, MTX_DEF); /* reset controller, turn motor off, clear fdout mirror reg */ fdout_wr(fdc, fdc->fdout = 0); bioq_init(&fdc->head); settle = hz / 8; return (0); } void fdc_start_worker(device_t dev) { struct fdc_data *fdc; fdc = device_get_softc(dev); kproc_create(fdc_thread, fdc, &fdc->fdc_thread, 0, 0, "fdc%d", device_get_unit(dev)); } int fdc_hints_probe(device_t dev) { const char *name, *dname; int i, error, dunit; /* * Probe and attach any children. We should probably detect * devices from the BIOS unless overridden. 
*/ name = device_get_nameunit(dev); i = 0; while ((resource_find_match(&i, &dname, &dunit, "at", name)) == 0) { resource_int_value(dname, dunit, "drive", &dunit); fdc_add_child(dev, dname, dunit); } if ((error = bus_generic_attach(dev)) != 0) return (error); return (0); } int fdc_print_child(device_t me, device_t child) { int retval = 0, flags; retval += bus_print_child_header(me, child); retval += printf(" on %s drive %d", device_get_nameunit(me), fdc_get_fdunit(child)); if ((flags = device_get_flags(me)) != 0) retval += printf(" flags %#x", flags); retval += printf("\n"); return (retval); } /* * Configuration/initialization, per drive. */ static int fd_probe(device_t dev) { int unit; int i; u_int st0, st3; struct fd_data *fd; struct fdc_data *fdc; int fdsu; int flags, type; fdsu = fdc_get_fdunit(dev); fd = device_get_softc(dev); fdc = device_get_softc(device_get_parent(dev)); flags = device_get_flags(dev); fd->dev = dev; fd->fdc = fdc; fd->fdsu = fdsu; unit = device_get_unit(dev); /* Auto-probe if fdinfo is present, but always allow override. */ type = flags & FD_TYPEMASK; if (type == FDT_NONE && (type = fdc_get_fdtype(dev)) != FDT_NONE) { fd->type = type; goto done; } else { /* make sure fdautoselect() will be called */ fd->flags = FD_EMPTY; fd->type = type; } #if defined(__i386__) || defined(__amd64__) if (fd->type == FDT_NONE && (unit == 0 || unit == 1)) { /* Look up what the BIOS thinks we have. */ if (unit == 0) fd->type = (rtcin(RTC_FDISKETTE) & 0xf0) >> 4; else fd->type = rtcin(RTC_FDISKETTE) & 0x0f; if (fd->type == FDT_288M_1) fd->type = FDT_288M; } #endif /* __i386__ || __amd64__ */ /* is there a unit? */ if (fd->type == FDT_NONE) return (ENXIO); mtx_lock(&fdc->fdc_mtx); /* select it */ fd_select(fd); fd_motor(fd, 1); fdc->fd = fd; fdc_reset(fdc); /* XXX reset, then unreset, etc. */ DELAY(1000000); /* 1 sec */ if ((flags & FD_NO_PROBE) == 0) { /* If we're at track 0 first seek inwards. */ if ((fdc_sense_drive(fdc, &st3) == 0) && (st3 & NE7_ST3_T0)) { /* Seek some steps... */ if (fdc_cmd(fdc, 3, NE7CMD_SEEK, fdsu, 10, 0) == 0) { /* ...wait a moment... */ DELAY(300000); /* make ctrlr happy: */ fdc_sense_int(fdc, NULL, NULL); } } for (i = 0; i < 2; i++) { /* * we must recalibrate twice, just in case the * heads have been beyond cylinder 76, since * most FDCs still barf when attempting to * recalibrate more than 77 steps */ /* go back to 0: */ if (fdc_cmd(fdc, 2, NE7CMD_RECAL, fdsu, 0) == 0) { /* a second being enough for full stroke seek*/ DELAY(i == 0 ? 1000000 : 300000); /* anything responding? 
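 * A clear equipment-check bit in ST0 means the recalibrate reached
 * track 0, i.e. a drive answered; see the "no track 0" check below.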
*/ if (fdc_sense_int(fdc, &st0, NULL) == 0 && (st0 & NE7_ST0_EC) == 0) break; /* already probed successfully */ } } } fd_motor(fd, 0); fdc->fd = NULL; mtx_unlock(&fdc->fdc_mtx); if ((flags & FD_NO_PROBE) == 0 && (st0 & NE7_ST0_EC) != 0) /* no track 0 -> no drive present */ return (ENXIO); done: switch (fd->type) { case FDT_12M: device_set_desc(dev, "1200-KB 5.25\" drive"); break; case FDT_144M: device_set_desc(dev, "1440-KB 3.5\" drive"); break; case FDT_288M: device_set_desc(dev, "2880-KB 3.5\" drive (in 1440-KB mode)"); break; case FDT_360K: device_set_desc(dev, "360-KB 5.25\" drive"); break; case FDT_720K: device_set_desc(dev, "720-KB 3.5\" drive"); break; default: return (ENXIO); } fd->track = FD_NO_TRACK; fd->fdc = fdc; fd->fdsu = fdsu; fd->options = 0; callout_init_mtx(&fd->toffhandle, &fd->fdc->fdc_mtx, 0); /* initialize densities for subdevices */ fdsettype(fd, fd_native_types[fd->type]); return (0); } /* * We have to do this in a geom event because GEOM is not running * when fd_attach() is. * XXX: move fd_attach after geom like ata/scsi disks */ static void fd_attach2(void *arg, int flag) { struct fd_data *fd; fd = arg; fd->fd_geom = g_new_geomf(&g_fd_class, "fd%d", device_get_unit(fd->dev)); fd->fd_provider = g_new_providerf(fd->fd_geom, "%s", fd->fd_geom->name); fd->fd_geom->softc = fd; g_error_provider(fd->fd_provider, 0); } static int fd_attach(device_t dev) { struct fd_data *fd; fd = device_get_softc(dev); g_post_event(fd_attach2, fd, M_WAITOK, NULL); fd->flags |= FD_EMPTY; bioq_init(&fd->fd_bq); return (0); } static void fd_detach_geom(void *arg, int flag) { struct fd_data *fd = arg; g_topology_assert(); g_wither_geom(fd->fd_geom, ENXIO); } static int fd_detach(device_t dev) { struct fd_data *fd; fd = device_get_softc(dev); g_waitfor_event(fd_detach_geom, fd, M_WAITOK, NULL); while (device_get_state(dev) == DS_BUSY) tsleep(fd, PZERO, "fdd", hz/10); callout_drain(&fd->toffhandle); return (0); } static device_method_t fd_methods[] = { /* Device interface */ DEVMETHOD(device_probe, fd_probe), DEVMETHOD(device_attach, fd_attach), DEVMETHOD(device_detach, fd_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), /* XXX */ DEVMETHOD(device_resume, bus_generic_resume), /* XXX */ { 0, 0 } }; static driver_t fd_driver = { "fd", fd_methods, sizeof(struct fd_data) }; static int fdc_modevent(module_t mod, int type, void *data) { return (g_modevent(NULL, type, &g_fd_class)); } DRIVER_MODULE(fd, fdc, fd_driver, fd_devclass, fdc_modevent, 0); Index: head/sys/geom/cache/g_cache.c =================================================================== --- head/sys/geom/cache/g_cache.c (revision 350693) +++ head/sys/geom/cache/g_cache.c (revision 350694) @@ -1,1019 +1,1020 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_cache, "GEOM cache module"); static MALLOC_DEFINE(M_GCACHE, "gcache_data", "GEOM_CACHE Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, cache, CTLFLAG_RW, 0, "GEOM_CACHE stuff"); static u_int g_cache_debug = 0; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, debug, CTLFLAG_RW, &g_cache_debug, 0, "Debug level"); static u_int g_cache_enable = 1; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, enable, CTLFLAG_RW, &g_cache_enable, 0, ""); static u_int g_cache_timeout = 10; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, timeout, CTLFLAG_RW, &g_cache_timeout, 0, ""); static u_int g_cache_idletime = 5; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, idletime, CTLFLAG_RW, &g_cache_idletime, 0, ""); static u_int g_cache_used_lo = 5; static u_int g_cache_used_hi = 20; static int sysctl_handle_pct(SYSCTL_HANDLER_ARGS) { u_int val = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val > 100) return (EINVAL); if ((arg1 == &g_cache_used_lo && val > g_cache_used_hi) || (arg1 == &g_cache_used_hi && g_cache_used_lo > val)) return (EINVAL); *(u_int *)arg1 = val; return (0); } SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_lo, CTLTYPE_UINT|CTLFLAG_RW, &g_cache_used_lo, 0, sysctl_handle_pct, "IU", ""); SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_hi, CTLTYPE_UINT|CTLFLAG_RW, &g_cache_used_hi, 0, sysctl_handle_pct, "IU", ""); static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force); static g_ctl_destroy_geom_t g_cache_destroy_geom; static g_taste_t g_cache_taste; static g_ctl_req_t g_cache_config; static g_dumpconf_t g_cache_dumpconf; struct g_class g_cache_class = { .name = G_CACHE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_cache_config, .taste = g_cache_taste, .destroy_geom = g_cache_destroy_geom }; #define OFF2BNO(off, sc) ((off) >> (sc)->sc_bshift) #define BNO2OFF(bno, sc) ((bno) << (sc)->sc_bshift) static struct g_cache_desc * g_cache_alloc(struct g_cache_softc *sc) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); if (!TAILQ_EMPTY(&sc->sc_usedlist)) { dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; dp->d_flags = 0; LIST_REMOVE(dp, d_next); return (dp); } if (sc->sc_nent > sc->sc_maxent) { sc->sc_cachefull++; return (NULL); } dp = malloc(sizeof(*dp), M_GCACHE, M_NOWAIT | M_ZERO); if (dp == NULL) return (NULL); dp->d_data = uma_zalloc(sc->sc_zone, M_NOWAIT); if (dp->d_data == NULL) { free(dp, M_GCACHE); return (NULL); } sc->sc_nent++; return (dp); } static void g_cache_free(struct g_cache_softc *sc, struct g_cache_desc *dp) { 
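The used_lo/used_hi knobs above are exposed through a custom SYSCTL_PROC handler so that writes are range-checked before they take effect. A minimal version of the same pattern is sketched below with a hypothetical kern.example.pct knob; the cross-check between the low and high watermarks done in sysctl_handle_pct() is omitted for brevity.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static u_int example_pct = 50;

static int
sysctl_example_pct(SYSCTL_HANDLER_ARGS)
{
        u_int val = *(u_int *)arg1;
        int error;

        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);         /* error, or a read-only access */
        if (val > 100)
                return (EINVAL);        /* reject out-of-range writes */
        *(u_int *)arg1 = val;
        return (0);
}
static SYSCTL_NODE(_kern, OID_AUTO, example, CTLFLAG_RW, 0, "Example knobs");
SYSCTL_PROC(_kern_example, OID_AUTO, pct, CTLTYPE_UINT | CTLFLAG_RW,
    &example_pct, 0, sysctl_example_pct, "IU", "A percentage (0-100)");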
mtx_assert(&sc->sc_mtx, MA_OWNED); uma_zfree(sc->sc_zone, dp->d_data); free(dp, M_GCACHE); sc->sc_nent--; } static void g_cache_free_used(struct g_cache_softc *sc) { struct g_cache_desc *dp; u_int n; mtx_assert(&sc->sc_mtx, MA_OWNED); n = g_cache_used_lo * sc->sc_maxent / 100; while (sc->sc_nused > n) { KASSERT(!TAILQ_EMPTY(&sc->sc_usedlist), ("used list empty")); dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; LIST_REMOVE(dp, d_next); g_cache_free(sc, dp); } } static void g_cache_deliver(struct g_cache_softc *sc, struct bio *bp, struct g_cache_desc *dp, int error) { off_t off1, off, len; mtx_assert(&sc->sc_mtx, MA_OWNED); KASSERT(OFF2BNO(bp->bio_offset, sc) <= dp->d_bno, ("wrong entry")); KASSERT(OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc) >= dp->d_bno, ("wrong entry")); off1 = BNO2OFF(dp->d_bno, sc); off = MAX(bp->bio_offset, off1); len = MIN(bp->bio_offset + bp->bio_length, off1 + sc->sc_bsize) - off; if (bp->bio_error == 0) bp->bio_error = error; if (bp->bio_error == 0) { bcopy(dp->d_data + (off - off1), bp->bio_data + (off - bp->bio_offset), len); } bp->bio_completed += len; KASSERT(bp->bio_completed <= bp->bio_length, ("extra data")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); } else if (OFF2BNO(off + len, sc) > dp->d_bno) { TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } dp->d_atime = time_uptime; } static void g_cache_done(struct bio *bp) { struct g_cache_softc *sc; struct g_cache_desc *dp; struct bio *bp2, *tmpbp; sc = bp->bio_from->geom->softc; KASSERT(G_CACHE_DESC1(bp) == sc, ("corrupt bio_caller in g_cache_done()")); dp = G_CACHE_DESC2(bp); mtx_lock(&sc->sc_mtx); bp2 = dp->d_biolist; while (bp2 != NULL) { KASSERT(G_CACHE_NEXT_BIO1(bp2) == sc, ("corrupt bio_driver in g_cache_done()")); tmpbp = G_CACHE_NEXT_BIO2(bp2); g_cache_deliver(sc, bp2, dp, bp->bio_error); bp2 = tmpbp; } dp->d_biolist = NULL; if (dp->d_flags & D_FLAG_INVALID) { sc->sc_invalid--; g_cache_free(sc, dp); } else if (bp->bio_error) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } g_cache_free(sc, dp); } mtx_unlock(&sc->sc_mtx); g_destroy_bio(bp); } static struct g_cache_desc * g_cache_lookup(struct g_cache_softc *sc, off_t bno) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); LIST_FOREACH(dp, &sc->sc_desclist[G_CACHE_BUCKET(bno)], d_next) if (dp->d_bno == bno) return (dp); return (NULL); } static int g_cache_read(struct g_cache_softc *sc, struct bio *bp) { struct bio *cbp; struct g_cache_desc *dp; mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset + bp->bio_completed, sc)); if (dp != NULL) { /* Add to waiters list or deliver. */ sc->sc_cachehits++; if (dp->d_biolist != NULL) { G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = dp->d_biolist; dp->d_biolist = bp; } else g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); return (0); } /* Cache miss. Allocate entry and schedule bio. 
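g_cache_deliver() above copies only the part of a cached block that overlaps the original request, using the OFF2BNO()/BNO2OFF() shift macros. The arithmetic is easiest to see in isolation; the userland check below is self-contained and uses made-up block size and offsets.

#include <stdio.h>
#include <stdint.h>

#define OFF2BNO(off, bshift)    ((off) >> (bshift))
#define BNO2OFF(bno, bshift)    ((bno) << (bshift))
#define MAX(a, b)               ((a) > (b) ? (a) : (b))
#define MIN(a, b)               ((a) < (b) ? (a) : (b))

int
main(void)
{
        int bshift = 16;                        /* assume 64 KB cache blocks */
        int64_t bsize = (int64_t)1 << bshift;
        int64_t req_off = 0x1fc00, req_len = 0x1000;    /* a 4 KB read */
        int64_t bno = OFF2BNO(req_off, bshift); /* block holding the start */
        int64_t off1 = BNO2OFF(bno, bshift);    /* that block's byte offset */
        /* Portion of the request satisfied by this one block: */
        int64_t off = MAX(req_off, off1);
        int64_t len = MIN(req_off + req_len, off1 + bsize) - off;

        printf("block %jd covers [%jd, %jd): copy %jd bytes "
            "starting at block offset %jd\n",
            (intmax_t)bno, (intmax_t)off1, (intmax_t)(off1 + bsize),
            (intmax_t)len, (intmax_t)(off - off1));
        return (0);
}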
*/ sc->sc_cachemisses++; dp = g_cache_alloc(sc); if (dp == NULL) { mtx_unlock(&sc->sc_mtx); return (ENOMEM); } cbp = g_clone_bio(bp); if (cbp == NULL) { g_cache_free(sc, dp); mtx_unlock(&sc->sc_mtx); return (ENOMEM); } dp->d_bno = OFF2BNO(bp->bio_offset + bp->bio_completed, sc); G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = NULL; dp->d_biolist = bp; LIST_INSERT_HEAD(&sc->sc_desclist[G_CACHE_BUCKET(dp->d_bno)], dp, d_next); mtx_unlock(&sc->sc_mtx); G_CACHE_DESC1(cbp) = sc; G_CACHE_DESC2(cbp) = dp; cbp->bio_done = g_cache_done; cbp->bio_offset = BNO2OFF(dp->d_bno, sc); cbp->bio_data = dp->d_data; cbp->bio_length = sc->sc_bsize; g_io_request(cbp, LIST_FIRST(&bp->bio_to->geom->consumer)); return (0); } static void g_cache_invalidate(struct g_cache_softc *sc, struct bio *bp) { struct g_cache_desc *dp; off_t bno, lim; mtx_lock(&sc->sc_mtx); bno = OFF2BNO(bp->bio_offset, sc); lim = OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc); do { if ((dp = g_cache_lookup(sc, bno)) != NULL) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } if (dp->d_biolist == NULL) g_cache_free(sc, dp); else { dp->d_flags = D_FLAG_INVALID; sc->sc_invalid++; } } bno++; } while (bno <= lim); mtx_unlock(&sc->sc_mtx); } static void g_cache_start(struct bio *bp) { struct g_cache_softc *sc; struct g_geom *gp; struct g_cache_desc *dp; struct bio *cbp; gp = bp->bio_to->geom; sc = gp->softc; G_CACHE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; if (!g_cache_enable) break; if (bp->bio_offset + bp->bio_length > sc->sc_tail) break; if (OFF2BNO(bp->bio_offset, sc) == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } else if (OFF2BNO(bp->bio_offset, sc) + 1 == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset, sc)); if (dp == NULL || dp->d_biolist != NULL) { mtx_unlock(&sc->sc_mtx); break; } sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; g_cache_invalidate(sc, bp); break; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; G_CACHE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_cache_go(void *arg) { struct g_cache_softc *sc = arg; struct g_cache_desc *dp; int i; mtx_assert(&sc->sc_mtx, MA_OWNED); /* Forcibly mark idle ready entries as used. */ for (i = 0; i < G_CACHE_BUCKETS; i++) { LIST_FOREACH(dp, &sc->sc_desclist[i], d_next) { if (dp->d_flags & D_FLAG_USED || dp->d_biolist != NULL || time_uptime - dp->d_atime < g_cache_idletime) continue; TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } } /* Keep the number of used entries low. 
*/ if (sc->sc_nused > g_cache_used_hi * sc->sc_maxent / 100) g_cache_free_used(sc); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); } static int g_cache_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_cache_orphan(struct g_consumer *cp) { g_topology_assert(); g_cache_destroy(cp->geom->softc, 1); } static struct g_cache_softc * g_cache_find_device(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp->softc); } return (NULL); } static struct g_geom * g_cache_create(struct g_class *mp, struct g_provider *pp, const struct g_cache_metadata *md, u_int type) { struct g_cache_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; u_int bshift; int i; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; G_CACHE_DEBUG(1, "Creating device %s.", md->md_name); /* Cache size is minimum 100. */ if (md->md_size < 100) { G_CACHE_DEBUG(0, "Invalid size for device %s.", md->md_name); return (NULL); } /* Block size restrictions. */ bshift = ffs(md->md_bsize) - 1; if (md->md_bsize == 0 || md->md_bsize > MAXPHYS || md->md_bsize != 1 << bshift || (md->md_bsize % pp->sectorsize) != 0) { G_CACHE_DEBUG(0, "Invalid blocksize for provider %s.", pp->name); return (NULL); } /* Check for duplicate unit. */ if (g_cache_find_device(mp, (const char *)&md->md_name) != NULL) { G_CACHE_DEBUG(0, "Provider %s already exists.", md->md_name); return (NULL); } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_type = type; sc->sc_bshift = bshift; sc->sc_bsize = 1 << bshift; sc->sc_zone = uma_zcreate("gcache", sc->sc_bsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sc->sc_mtx, "GEOM CACHE mutex", NULL, MTX_DEF); for (i = 0; i < G_CACHE_BUCKETS; i++) LIST_INIT(&sc->sc_desclist[i]); TAILQ_INIT(&sc->sc_usedlist); sc->sc_maxent = md->md_size; callout_init_mtx(&sc->sc_callout, &sc->sc_mtx, 0); gp->softc = sc; sc->sc_geom = gp; gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; gp->dumpconf = g_cache_dumpconf; newpp = g_new_providerf(gp, "cache/%s", gp->name); newpp->sectorsize = pp->sectorsize; newpp->mediasize = pp->mediasize; if (type == G_CACHE_TYPE_AUTOMATIC) newpp->mediasize -= pp->sectorsize; sc->sc_tail = BNO2OFF(OFF2BNO(newpp->mediasize, sc), sc); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { G_CACHE_DEBUG(0, "Cannot attach to provider %s.", pp->name); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_mtx); g_free(sc); g_destroy_geom(gp); return (NULL); } g_error_provider(newpp, 0); G_CACHE_DEBUG(0, "Device %s created.", gp->name); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); return (gp); } static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; struct g_cache_desc *dp, *dp2; int i; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CACHE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CACHE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_CACHE_DEBUG(0, "Device %s removed.", 
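g_cache_go() above reschedules itself with callout_reset(9); the callout is bound to the softc mutex by callout_init_mtx(9) in g_cache_create(), and g_cache_destroy() waits for any running handler with callout_drain(9). A compact sketch of that periodic-housekeeping pattern follows; struct tick_softc and the function names are illustrative.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

struct tick_softc {
        struct mtx      mtx;
        struct callout  co;
        int             period;         /* seconds */
};

static void
tick_housekeeping(void *arg)
{
        struct tick_softc *sc = arg;

        mtx_assert(&sc->mtx, MA_OWNED); /* callout_init_mtx() => called locked */
        /* ... periodic work goes here ... */
        callout_reset(&sc->co, sc->period * hz, tick_housekeeping, sc);
}

static void
tick_start(struct tick_softc *sc)
{
        mtx_init(&sc->mtx, "tick mtx", NULL, MTX_DEF);
        callout_init_mtx(&sc->co, &sc->mtx, 0);
        sc->period = 10;
        mtx_lock(&sc->mtx);
        callout_reset(&sc->co, sc->period * hz, tick_housekeeping, sc);
        mtx_unlock(&sc->mtx);
}

static void
tick_stop(struct tick_softc *sc)
{
        /* Wait for a running handler to finish before destroying the lock. */
        callout_drain(&sc->co);
        mtx_destroy(&sc->mtx);
}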
gp->name); } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); for (i = 0; i < G_CACHE_BUCKETS; i++) { dp = LIST_FIRST(&sc->sc_desclist[i]); while (dp != NULL) { dp2 = LIST_NEXT(dp, d_next); g_cache_free(sc, dp); dp = dp2; } } mtx_unlock(&sc->sc_mtx); mtx_destroy(&sc->sc_mtx); uma_zdestroy(sc->sc_zone); g_free(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_cache_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_cache_destroy(gp->softc, 0)); } static int g_cache_read_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ cache_metadata_decode(buf, md); g_free(buf); return (0); } static int g_cache_write_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) return (error); pp = cp->provider; buf = malloc((size_t)pp->sectorsize, M_GCACHE, M_WAITOK | M_ZERO); cache_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GCACHE); return (error); } static struct g_geom * g_cache_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_cache_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_CACHE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "cache:taste"); gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_cache_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_CACHE_MAGIC) != 0) return (NULL); if (md.md_version > G_CACHE_VERSION) { printf("geom_cache.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_AUTOMATIC); if (gp == NULL) { G_CACHE_DEBUG(0, "Can't create %s.", md.md_name); return (NULL); } return (gp); } static void g_cache_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_provider *pp; struct g_geom *gp; intmax_t *bsize, *size; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } md.md_size = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No 
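g_cache_read_metadata() above follows the usual GEOM metadata convention: the class metadata lives in the last sector of the underlying provider and is read with temporary read access, with the topology lock dropped around the actual I/O. The helper below restates that pattern in isolation; the caller is assumed to hold the topology lock and to g_free() the returned buffer after decoding it.

#include <sys/param.h>
#include <sys/systm.h>
#include <geom/geom.h>

static int
read_last_sector(struct g_consumer *cp, u_char **bufp)
{
        struct g_provider *pp;
        u_char *buf;
        int error;

        g_topology_assert();
        error = g_access(cp, 1, 0, 0);          /* need read access */
        if (error != 0)
                return (error);
        pp = cp->provider;
        g_topology_unlock();                    /* do not hold it across I/O */
        buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
            &error);
        g_topology_lock();
        g_access(cp, -1, 0, 0);
        if (buf == NULL)
                return (error);
        *bufp = buf;                            /* caller g_free()s this */
        return (0);
}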
'%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } md.md_bsize = (u_int)*bsize; /* This field is not important here. */ md.md_provsize = 0; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_CACHE_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't create %s.", md.md_name); return; } } static void g_cache_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_cache_softc *sc; struct g_consumer *cp; intmax_t *bsize, *size; const char *name; int error, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Missing device."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size != 0 && (u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } if ((u_int)*size != 0) sc->sc_maxent = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No '%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } if (sc->sc_type != G_CACHE_TYPE_AUTOMATIC) return; strlcpy(md.md_name, name, sizeof(md.md_name)); strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; if ((u_int)*size != 0) md.md_size = (u_int)*size; else md.md_size = sc->sc_maxent; if ((u_int)*bsize != 0) md.md_bsize = (u_int)*bsize; else md.md_bsize = sc->sc_bsize; cp = LIST_FIRST(&sc->sc_geom->consumer); md.md_provsize = cp->provider->mediasize; error = g_cache_write_metadata(cp, &md); if (error == 0) G_CACHE_DEBUG(2, "Metadata on %s updated.", cp->provider->name); else G_CACHE_DEBUG(0, "Cannot update metadata on %s (error=%d).", cp->provider->name, error); } static void g_cache_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_cache_softc *sc; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_cache_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, 
error); return; } } } static void g_cache_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_cache_softc *sc; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } sc->sc_reads = 0; sc->sc_readbytes = 0; sc->sc_cachereads = 0; sc->sc_cachereadbytes = 0; sc->sc_cachehits = 0; sc->sc_cachemisses = 0; sc->sc_cachefull = 0; sc->sc_writes = 0; sc->sc_wrotebytes = 0; } } static void g_cache_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CACHE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_cache_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_cache_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_cache_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_cache_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_cache_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_cache_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%u\n", indent, sc->sc_maxent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_bsize); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_tail); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nused); sbuf_printf(sb, "%s%u\n", indent, sc->sc_invalid); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereadbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachehits); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachemisses); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachefull); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_cache_class, g_cache); MODULE_VERSION(geom_cache, 0); Index: head/sys/geom/cache/g_cache.h =================================================================== --- head/sys/geom/cache/g_cache.h (revision 350693) +++ head/sys/geom/cache/g_cache.h (revision 350694) @@ -1,148 +1,133 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_CACHE_H_ #define _G_CACHE_H_ #include #define G_CACHE_CLASS_NAME "CACHE" #define G_CACHE_MAGIC "GEOM::CACHE" #define G_CACHE_VERSION 1 #ifdef _KERNEL #define G_CACHE_TYPE_MANUAL 0 #define G_CACHE_TYPE_AUTOMATIC 1 -#define G_CACHE_DEBUG(lvl, ...) do { \ - if (g_cache_debug >= (lvl)) { \ - printf("GEOM_CACHE"); \ - if (g_cache_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_CACHE_LOGREQ(bp, ...) do { \ - if (g_cache_debug >= 2) { \ - printf("GEOM_CACHE[2]: "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_CACHE_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_CACHE", g_cache_debug, (lvl), NULL, __VA_ARGS__) +#define G_CACHE_LOGREQ(bp, ...) \ + _GEOM_DEBUG("GEOM_CACHE", g_cache_debug, 2, (bp), __VA_ARGS__) #define G_CACHE_BUCKETS (1 << 3) #define G_CACHE_BUCKET(bno) ((bno) & (G_CACHE_BUCKETS - 1)) struct g_cache_softc { struct g_geom *sc_geom; int sc_type; u_int sc_bshift; u_int sc_bsize; off_t sc_tail; struct mtx sc_mtx; struct callout sc_callout; LIST_HEAD(, g_cache_desc) sc_desclist[G_CACHE_BUCKETS]; TAILQ_HEAD(, g_cache_desc) sc_usedlist; uma_zone_t sc_zone; u_int sc_maxent; /* max entries */ u_int sc_nent; /* allocated entries */ u_int sc_nused; /* re-useable entries */ u_int sc_invalid; /* invalid entries */ uintmax_t sc_reads; /* #reads */ uintmax_t sc_readbytes; /* bytes read */ uintmax_t sc_cachereads; /* #reads from cache */ uintmax_t sc_cachereadbytes; /* bytes read from cache */ uintmax_t sc_cachehits; /* cache hits */ uintmax_t sc_cachemisses; /* cache misses */ uintmax_t sc_cachefull; /* #times a cache was full */ uintmax_t sc_writes; /* #writes */ uintmax_t sc_wrotebytes; /* bytes written */ }; #define sc_name sc_geom->name struct g_cache_desc { off_t d_bno; /* block number */ caddr_t d_data; /* data area */ struct bio *d_biolist; /* waiters */ time_t d_atime; /* access time */ int d_flags; /* flags */ #define D_FLAG_USED (1 << 0) /* can be reused */ #define D_FLAG_INVALID (1 << 1) /* invalid */ LIST_ENTRY(g_cache_desc) d_next; /* list */ TAILQ_ENTRY(g_cache_desc) d_used; /* used list */ }; #define G_CACHE_NEXT_BIO1(bp) (bp)->bio_driver1 #define G_CACHE_NEXT_BIO2(bp) (bp)->bio_driver2 #define G_CACHE_DESC1(bp) (bp)->bio_caller1 #define G_CACHE_DESC2(bp) (bp)->bio_caller2 #endif /* _KERNEL */ struct g_cache_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Cache value. 
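The header hunk above is the substance of this change for g_cache: the hand-rolled G_CACHE_DEBUG/G_CACHE_LOGREQ printf macros are removed and both become thin wrappers around a shared _GEOM_DEBUG() helper, with the helper picked up through the new include added near the top of each .c file (the same edit appears below for g_concat and g_eli). The helper's definition is not part of this hunk; a definition equivalent to the removed macros would plausibly look like the sketch below, but treat it as an assumption rather than the committed code.

#define _GEOM_DEBUG(classname, debugvar, lvl, bp, ...)  do {            \
        if ((debugvar) >= (lvl)) {                                      \
                printf("%s", (classname));                              \
                if ((debugvar) > 0)                                     \
                        printf("[%u]", (lvl));                          \
                printf(": ");                                           \
                printf(__VA_ARGS__);                                    \
                if ((bp) != NULL) {                                     \
                        printf(" ");                                    \
                        g_print_bio(bp);                                \
                }                                                       \
                printf("\n");                                           \
        }                                                               \
} while (0)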
*/ uint32_t md_bsize; /* Cache block size. */ uint32_t md_size; /* Cache size. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void cache_metadata_encode(const struct g_cache_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_bsize); le32enc(data + 40, md->md_size); le64enc(data + 44, md->md_provsize); } static __inline void cache_metadata_decode(const u_char *data, struct g_cache_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_bsize = le32dec(data + 36); md->md_size = le32dec(data + 40); md->md_provsize = le64dec(data + 44); } #endif /* _G_CACHE_H_ */ Index: head/sys/geom/concat/g_concat.c =================================================================== --- head/sys/geom/concat/g_concat.c (revision 350693) +++ head/sys/geom/concat/g_concat.c (revision 350694) @@ -1,1029 +1,1030 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_concat, "GEOM concatenation support"); static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW, 0, "GEOM_CONCAT stuff"); static u_int g_concat_debug = 0; SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0, "Debug level"); static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force); static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_concat_taste; static g_ctl_req_t g_concat_config; static g_dumpconf_t g_concat_dumpconf; struct g_class g_concat_class = { .name = G_CONCAT_CLASS_NAME, .version = G_VERSION, .ctlreq = g_concat_config, .taste = g_concat_taste, .destroy_geom = g_concat_destroy_geom }; /* * Greatest Common Divisor. 
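cache_metadata_encode()/_decode() above serialize the metadata at fixed byte offsets with the le32enc()/le64enc() family, so the on-disk format is independent of host byte order. A runnable userland miniature of the same idea follows; struct tiny_md and its layout are invented for illustration.

#include <sys/endian.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct tiny_md {
        char            magic[16];
        uint32_t        version;
        uint64_t        provsize;
};

static void
tiny_md_encode(const struct tiny_md *md, unsigned char *data)
{
        memcpy(data, md->magic, sizeof(md->magic));     /* bytes 0..15 */
        le32enc(data + 16, md->version);                /* bytes 16..19 */
        le64enc(data + 20, md->provsize);               /* bytes 20..27 */
}

static void
tiny_md_decode(const unsigned char *data, struct tiny_md *md)
{
        memcpy(md->magic, data, sizeof(md->magic));
        md->version = le32dec(data + 16);
        md->provsize = le64dec(data + 20);
}

int
main(void)
{
        struct tiny_md in = { "GEOM::EXAMPLE", 1, 1048576 }, out;
        unsigned char sector[512] = { 0 };

        tiny_md_encode(&in, sector);
        tiny_md_decode(sector, &out);
        printf("%s v%u size %ju\n", out.magic, out.version,
            (uintmax_t)out.provsize);
        return (0);
}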
*/ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } /* * Return the number of valid disks. */ static u_int g_concat_nvalid(struct g_concat_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_consumer != NULL) no++; } return (no); } static void g_concat_remove_disk(struct g_concat_disk *disk) { struct g_consumer *cp; struct g_concat_softc *sc; g_topology_assert(); KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__)); sc = disk->d_softc; cp = disk->d_consumer; if (!disk->d_removed) { G_CONCAT_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); disk->d_removed = 1; } if (sc->sc_provider != NULL) { G_CONCAT_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; disk->d_consumer = NULL; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device. */ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_concat_destroy(sc, 1); } static void g_concat_orphan(struct g_consumer *cp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; disk = cp->private; if (disk == NULL) /* Possible? */ return; g_concat_remove_disk(disk); } static int g_concat_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_concat_disk *disk; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; disk = cp1->private; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && disk->d_removed) { g_concat_remove_disk(disk); /* May destroy geom. 
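The gcd()/lcm() helpers above exist so that g_concat_check_and_run() can publish a sector size every component can honor: the least common multiple of the components' sector sizes. A quick userland check of why lcm(), not max(), is the right operation:

#include <stdio.h>

static unsigned
gcd(unsigned a, unsigned b)
{
        while (b != 0) {
                unsigned c = a;
                a = b;
                b = c % b;
        }
        return (a);
}

static unsigned
lcm(unsigned a, unsigned b)
{
        return (a * b / gcd(a, b));
}

int
main(void)
{
        printf("%u\n", lcm(512, 4096));         /* 4096 */
        printf("%u\n", lcm(512, 2048));         /* 2048 */
        printf("%u\n", lcm(1024, 1536));        /* 3072: not simply max(a, b) */
        return (0);
}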
*/ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_concat_candelete(struct bio *bp) { struct g_concat_softc *sc; struct g_concat_disk *disk; int i, val; sc = bp->bio_to->geom->softc; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (!disk->d_removed && disk->d_candelete) break; } val = i < sc->sc_ndisks; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_concat_kernel_dump(struct bio *bp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; u_int i; sc = bp->bio_to->geom->softc; gkd = (struct g_kerneldump *)bp->bio_data; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_start <= gkd->offset && sc->sc_disks[i].d_end > gkd->offset) break; } if (i == sc->sc_ndisks) { g_io_deliver(bp, EOPNOTSUPP); return; } disk = &sc->sc_disks[i]; gkd->offset -= disk->d_start; if (gkd->length > disk->d_end - disk->d_start - gkd->offset) gkd->length = disk->d_end - disk->d_start - gkd->offset; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_CONCAT_DEBUG(1, "Kernel dump will go to %s.", disk->d_consumer->provider->name); } static void g_concat_done(struct bio *bp) { struct g_concat_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } static void g_concat_flush(struct g_concat_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_concat_done; cbp->bio_caller1 = sc->sc_disks[no].d_consumer; cbp->bio_to = sc->sc_disks[no].d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, cp); } } static void g_concat_start(struct bio *bp) { struct bio_queue_head queue; struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_provider *pp; off_t offset, end, length, off, len; struct bio *cbp; char *addr; u_int no; pp = bp->bio_to; sc = pp->geom->softc; /* * If sc == NULL, provider's error should be set and g_concat_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_CONCAT_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_concat_flush(sc, bp); return; case BIO_GETATTR: if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_concat_kernel_dump(bp); return; } else if (strcmp("GEOM::candelete", bp->bio_attribute) == 0) { g_concat_candelete(bp); return; } /* To which provider it should be delivered? 
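g_concat_flush() above clones the flush once per component and g_concat_done() aggregates the completions: the first error wins, completed bytes add up, and the parent bio is delivered only when its last clone finishes. A sketch of just that aggregation step is shown below; in the driver these updates are serialized with sc->sc_lock, which is elided here and noted in the comments.

#include <sys/param.h>
#include <sys/bio.h>
#include <geom/geom.h>

static void
fanout_done(struct bio *bp)
{
        struct bio *pbp = bp->bio_parent;

        /*
         * NB: if clones can complete concurrently, these updates must be
         * serialized; g_concat_done() holds the softc mutex around them.
         */
        if (pbp->bio_error == 0)                /* first error wins */
                pbp->bio_error = bp->bio_error;
        pbp->bio_completed += bp->bio_completed;
        g_destroy_bio(bp);
        /* bio_children was bumped by g_clone_bio(); bio_inbed counts done. */
        if (++pbp->bio_inbed == pbp->bio_children)
                g_io_deliver(pbp, pbp->bio_error);
}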
*/ /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } offset = bp->bio_offset; length = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; end = offset + length; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; if (disk->d_end <= offset) continue; if (disk->d_start >= end) break; off = offset - disk->d_start; len = MIN(length, disk->d_end - offset); length -= len; offset += len; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); /* * Fill in the component buf structure. */ if (len == bp->bio_length) cbp->bio_done = g_std_done; else cbp->bio_done = g_concat_done; cbp->bio_offset = off; cbp->bio_length = len; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; addr += len; cbp->bio_to = disk->d_consumer->provider; cbp->bio_caller1 = disk; if (length == 0) break; } KASSERT(length == 0, ("Length is still greater than 0 (class=%s, name=%s).", bp->bio_to->geom->class->name, bp->bio_to->geom->name)); while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, disk->d_consumer); } } static void g_concat_check_and_run(struct g_concat_softc *sc) { struct g_concat_disk *disk; struct g_provider *dp, *pp; u_int no, sectorsize = 0; off_t start; int error; g_topology_assert(); if (g_concat_nvalid(sc) != sc->sc_ndisks) return; pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE | G_PF_ACCEPT_UNMAPPED; start = 0; for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; dp = disk->d_consumer->provider; disk->d_start = start; disk->d_end = disk->d_start + dp->mediasize; if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) disk->d_end -= dp->sectorsize; start = disk->d_end; error = g_access(disk->d_consumer, 1, 0, 0); if (error == 0) { error = g_getattr("GEOM::candelete", disk->d_consumer, &disk->d_candelete); if (error != 0) disk->d_candelete = 0; (void)g_access(disk->d_consumer, -1, 0, 0); } else G_CONCAT_DEBUG(1, "Failed to access disk %s, error %d.", dp->name, error); if (no == 0) sectorsize = dp->sectorsize; else sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_CONCAT_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->sectorsize = sectorsize; /* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. 
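The main loop of g_concat_start() above carves one request into per-component pieces using each disk's [d_start, d_end) byte range. The interval arithmetic is pulled out below into a runnable userland sketch with made-up component sizes; the length > 0 loop condition plays the role of the driver's early break.

#include <stdio.h>
#include <stdint.h>

struct piece { int disk; int64_t off, len; };

static int
concat_split(const int64_t *d_start, const int64_t *d_end, int ndisks,
    int64_t offset, int64_t length, struct piece *out)
{
        int n = 0;

        for (int i = 0; i < ndisks && length > 0; i++) {
                if (d_end[i] <= offset)
                        continue;       /* request starts past this disk */
                int64_t off = offset - d_start[i];
                int64_t len = length < d_end[i] - offset ?
                    length : d_end[i] - offset;
                out[n++] = (struct piece){ i, off, len };
                offset += len;
                length -= len;
        }
        return (n);
}

int
main(void)
{
        /* Two 1 MB components; a 128 KB request that straddles the seam. */
        int64_t start[] = { 0, 1 << 20 }, end[] = { 1 << 20, 2 << 20 };
        struct piece p[2];
        int n = concat_split(start, end, 2, (1 << 20) - 65536, 131072, p);

        for (int i = 0; i < n; i++)
                printf("disk %d: offset %jd length %jd\n",
                    p[i].disk, (intmax_t)p[i].off, (intmax_t)p[i].len);
        return (0);
}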
*/ pp->mediasize = start; pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize; pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset; sc->sc_provider = pp; g_error_provider(pp, 0); G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ concat_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no) { struct g_concat_disk *disk; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); disk = &sc->sc_disks[no]; /* Check if disk is not already attached. */ if (disk->d_consumer != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) { struct g_concat_metadata md; /* Re-read metadata. */ error = g_concat_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } cp->private = disk; disk->d_consumer = cp; disk->d_softc = sc; disk->d_start = 0; /* not yet */ disk->d_end = 0; /* not yet */ disk->d_removed = 0; G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_concat_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_concat_create(struct g_class *mp, const struct g_concat_metadata *md, u_int type) { struct g_concat_softc *sc; struct g_geom *gp; u_int no; G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disks is minimum. 
*/ if (md->md_all < 1) return (NULL); /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_CONCAT_DEBUG(0, "Device %s already configured.", gp->name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO); gp->start = g_concat_start; gp->spoiled = g_concat_orphan; gp->orphan = g_concat_orphan; gp->access = g_concat_access; gp->dumpconf = g_concat_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks, M_CONCAT, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no].d_consumer = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CONCAT_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_CONCAT_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_concat_remove_disk(cp->private); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)", gp->name)); free(sc->sc_disks, M_CONCAT); mtx_destroy(&sc->sc_lock); free(sc, M_CONCAT); G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_concat_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_concat_softc *sc; sc = gp->softc; return (g_concat_destroy(sc, 0)); } static struct g_geom * g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_concat_metadata md; struct g_concat_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_CONCAT_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "concat:taste"); gp->start = g_concat_start; gp->access = g_concat_access; gp->orphan = g_concat_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_concat_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0) return (NULL); if (md.md_version > G_CONCAT_VERSION) { printf("geom_concat.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 3) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. 
*/ if (md.md_version < 4) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC); if (gp == NULL) { G_CONCAT_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_concat_destroy(sc, 1); return (NULL); } } return (gp); } static void g_concat_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_concat_metadata md; struct g_provider *pp; struct g_concat_softc *sc; struct g_geom *gp; struct sbuf *sb; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic)); md.md_version = G_CONCAT_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_CONCAT_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); return; } } gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); KASSERT(pp != NULL, ("Provider %s disappear?!", name)); if (g_concat_add_disk(sc, pp, no - 1) != 0) { G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_concat_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_concat_softc * g_concat_find_device(struct g_class *mp, const char *name) { struct g_concat_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_concat_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_concat_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_concat_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CONCAT_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_concat_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_concat_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_concat_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
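g_concat_ctl_create() above accumulates the names of components that failed to attach into a single sbuf(9) and only reports the message if something actually failed. The same pattern in miniature, with report_failures() and its arguments invented for illustration:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sbuf.h>

static void
report_failures(const char **names, int n, const int *failed)
{
        struct sbuf *sb;
        int i, nfail = 0;

        sb = sbuf_new_auto();                   /* auto-extending buffer */
        sbuf_printf(sb, "Can't attach disk(s):");
        for (i = 0; i < n; i++) {
                if (failed[i]) {
                        sbuf_printf(sb, " %s", names[i]);
                        nfail++;
                }
        }
        sbuf_finish(sb);                        /* terminate before sbuf_data() */
        if (nfail != 0)
                printf("%s\n", sbuf_data(sb));
        sbuf_delete(sb);
}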
*/ } else if (cp != NULL) { struct g_concat_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_end); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_start); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_CONCAT_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_CONCAT_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_concat_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_cat(sb, "UP"); else sbuf_cat(sb, "DOWN"); sbuf_cat(sb, "\n"); } } DECLARE_GEOM_CLASS(g_concat_class, g_concat); MODULE_VERSION(geom_concat, 0); Index: head/sys/geom/concat/g_concat.h =================================================================== --- head/sys/geom/concat/g_concat.h (revision 350693) +++ head/sys/geom/concat/g_concat.h (revision 350694) @@ -1,130 +1,115 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_CONCAT_H_ #define _G_CONCAT_H_ #include #define G_CONCAT_CLASS_NAME "CONCAT" #define G_CONCAT_MAGIC "GEOM::CONCAT" /* * Version history: * 1 - Initial version number. * 2 - Added 'stop' command to gconcat(8). * 3 - Added md_provider field to metadata and '-h' option to gconcat(8). * 4 - Added md_provsize field to metadata. */ #define G_CONCAT_VERSION 4 #ifdef _KERNEL #define G_CONCAT_TYPE_MANUAL 0 #define G_CONCAT_TYPE_AUTOMATIC 1 -#define G_CONCAT_DEBUG(lvl, ...) do { \ - if (g_concat_debug >= (lvl)) { \ - printf("GEOM_CONCAT"); \ - if (g_concat_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_CONCAT_LOGREQ(bp, ...) do { \ - if (g_concat_debug >= 2) { \ - printf("GEOM_CONCAT[2]: "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_CONCAT_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_CONCAT", g_concat_debug, (lvl), NULL, __VA_ARGS__) +#define G_CONCAT_LOGREQ(bp, ...) 
\ + _GEOM_DEBUG("GEOM_CONCAT", g_concat_debug, 2, (bp), __VA_ARGS__) struct g_concat_disk { struct g_consumer *d_consumer; struct g_concat_softc *d_softc; off_t d_start; off_t d_end; int d_candelete; int d_removed; }; struct g_concat_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* concat unique ID */ struct g_concat_disk *sc_disks; uint16_t sc_ndisks; struct mtx sc_lock; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_concat_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Concat name. */ uint32_t md_id; /* Unique ID. */ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void concat_metadata_encode(const struct g_concat_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); bcopy(md->md_provider, data + 44, sizeof(md->md_provider)); le64enc(data + 60, md->md_provsize); } static __inline void concat_metadata_decode(const u_char *data, struct g_concat_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); bcopy(data + 44, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 60); } #endif /* _G_CONCAT_H_ */ Index: head/sys/geom/eli/g_eli.c =================================================================== --- head/sys/geom/eli/g_eli.c (revision 350693) +++ head/sys/geom/eli/g_eli.c (revision 350694) @@ -1,1439 +1,1440 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2019 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include FEATURE(geom_eli, "GEOM crypto module"); MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW, 0, "GEOM_ELI stuff"); static int g_eli_version = G_ELI_VERSION; SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0, "GELI version"); int g_eli_debug = 0; SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0, "Debug level"); static u_int g_eli_tries = 3; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0, "Number of tries for entering the passphrase"); static u_int g_eli_visible_passphrase = GETS_NOECHO; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN, &g_eli_visible_passphrase, 0, "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)"); u_int g_eli_overwrites = G_ELI_OVERWRITES; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites, 0, "Number of times on-disk keys should be overwritten when destroying them"); static u_int g_eli_threads = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0, "Number of threads doing crypto work"); u_int g_eli_batch = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0, "Use crypto operations batching"); /* * Passphrase cached during boot, in order to be more user-friendly if * there are multiple providers using the same passphrase. */ static char cached_passphrase[256]; static u_int g_eli_boot_passcache = 1; TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache); SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD, &g_eli_boot_passcache, 0, "Passphrases are cached during boot process for possible reuse"); static void fetch_loader_passphrase(void * dummy) { char * env_passphrase; KASSERT(dynamic_kenv, ("need dynamic kenv")); if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) { /* Extract passphrase from the environment. */ strlcpy(cached_passphrase, env_passphrase, sizeof(cached_passphrase)); freeenv(env_passphrase); /* Wipe the passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); } } SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY, fetch_loader_passphrase, NULL); static void zero_boot_passcache(void) { explicit_bzero(cached_passphrase, sizeof(cached_passphrase)); } static void zero_geli_intake_keys(void) { struct keybuf *keybuf; int i; if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, clear all GELI keys. 
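 * The key buffer handed over by the loader is only accessed through the
 * fields used below, so the shape assumed here is roughly the following
 * (a sketch; the real declaration and the entry size live elsewhere in
 * the tree):
 *
 *	struct keybuf_ent {
 *		uint32_t	ke_type;	(KEYBUF_TYPE_GELI, _NONE, ...)
 *		char		ke_data[...];	(raw key material)
 *	};
 *	struct keybuf {
 *		uint32_t	kb_nents;
 *		struct keybuf_ent kb_ents[];
 *	};
 *
 * GELI entries are wiped and downgraded to KEYBUF_TYPE_NONE once root is
 * mounted, so the key material does not linger in memory.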
*/ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { explicit_bzero(keybuf->kb_ents[i].ke_data, sizeof(keybuf->kb_ents[i].ke_data)); keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE; } } } } static void zero_intake_passcache(void *dummy) { zero_boot_passcache(); zero_geli_intake_keys(); } EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0); static eventhandler_tag g_eli_pre_sync = NULL; static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp, off_t offset, struct g_eli_metadata *md); static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_eli_init(struct g_class *mp); static void g_eli_fini(struct g_class *mp); static g_taste_t g_eli_taste; static g_dumpconf_t g_eli_dumpconf; struct g_class g_eli_class = { .name = G_ELI_CLASS_NAME, .version = G_VERSION, .ctlreq = g_eli_config, .taste = g_eli_taste, .destroy_geom = g_eli_destroy_geom, .init = g_eli_init, .fini = g_eli_fini }; /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ /* * EAGAIN from crypto(9) means, that we were probably balanced to another crypto * accelerator or something like this. * The function updates the SID and rerun the operation. */ int g_eli_crypto_rerun(struct cryptop *crp) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; bp = (struct bio *)crp->crp_opaque; sc = bp->bio_to->geom->softc; LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_number == bp->bio_pflags) break; } KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags)); G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).", bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid, crp->crp_session); wr->w_sid = crp->crp_session; crp->crp_etype = 0; error = crypto_dispatch(crp); if (error == 0) return (0); G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error); crp->crp_etype = error; return (error); } static void g_eli_getattr_done(struct bio *bp) { if (bp->bio_error == 0 && !strcmp(bp->bio_attribute, "GEOM::physpath")) { strlcat(bp->bio_data, "/eli", bp->bio_length); } g_std_done(bp); } /* * The function is called afer reading encrypted data from the provider. * * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_read_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already? */ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; sc = pbp->bio_to->geom->softc; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; if (pbp->bio_driver2 != NULL) { free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; } g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } /* * The function is called after we encrypt and write data. 
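 * A single parent request may have been split into several child BIOs;
 * bio_inbed counts the children that have come back and the parent is
 * only completed, and its encrypted buffer (bio_driver2) freed, once
 * that count reaches bio_children.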
* * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver */ void g_eli_write_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already? */ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; } else pbp->bio_completed = pbp->bio_length; /* * Write is finished, send it up. */ sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); } /* * This function should never be called, but GEOM made as it set ->orphan() * method for every geom. */ static void g_eli_orphan_spoil_assert(struct g_consumer *cp) { panic("Function %s() called for %s.", __func__, cp->geom->name); } static void g_eli_orphan(struct g_consumer *cp) { struct g_eli_softc *sc; g_topology_assert(); sc = cp->geom->softc; if (sc == NULL) return; g_eli_destroy(sc, TRUE); } static void g_eli_resize(struct g_consumer *cp) { struct g_eli_softc *sc; struct g_provider *epp, *pp; off_t oldsize; g_topology_assert(); sc = cp->geom->softc; if (sc == NULL) return; if ((sc->sc_flags & G_ELI_FLAG_AUTORESIZE) == 0) { G_ELI_DEBUG(0, "Autoresize is turned off, old size: %jd.", (intmax_t)sc->sc_provsize); return; } pp = cp->provider; if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) { struct g_eli_metadata md; u_char *sector; int error; sector = NULL; error = g_eli_read_metadata_offset(cp->geom->class, pp, sc->sc_provsize - pp->sectorsize, &md); if (error != 0) { G_ELI_DEBUG(0, "Cannot read metadata from %s (error=%d).", pp->name, error); goto iofail; } md.md_provsize = pp->mediasize; sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot store metadata on %s (error=%d).", pp->name, error); goto iofail; } explicit_bzero(sector, pp->sectorsize); error = g_write_data(cp, sc->sc_provsize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot clear old metadata from %s (error=%d).", pp->name, error); goto iofail; } iofail: explicit_bzero(&md, sizeof(md)); if (sector != NULL) { explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); } } oldsize = sc->sc_mediasize; sc->sc_mediasize = eli_mediasize(sc, pp->mediasize, pp->sectorsize); g_eli_key_resize(sc); sc->sc_provsize = pp->mediasize; epp = LIST_FIRST(&sc->sc_geom->provider); g_resize_provider(epp, sc->sc_mediasize); G_ELI_DEBUG(0, "Device %s size changed from %jd to %jd.", epp->name, (intmax_t)oldsize, (intmax_t)sc->sc_mediasize); } /* * BIO_READ: * G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ static void g_eli_start(struct bio *bp) { struct g_eli_softc *sc; struct g_consumer *cp; struct bio *cbp; sc = bp->bio_to->geom->softc; KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_ELI_LOGREQ(2, bp, "Request received."); switch 
(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_GETATTR: case BIO_FLUSH: case BIO_ZONE: break; case BIO_DELETE: /* * If the user hasn't set the NODELETE flag, we just pass * it down the stack and let the layers beneath us do (or * not) whatever they do with it. If they have, we * reject it. A possible extension would be an * additional flag to take it as a hint to shred the data * with [multiple?] overwrites. */ if (!(sc->sc_flags & G_ELI_FLAG_NODELETE)) break; default: g_io_deliver(bp, EOPNOTSUPP); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = cbp; bp->bio_pflags = G_ELI_NEW_BIO; switch (bp->bio_cmd) { case BIO_READ: if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) { g_eli_crypto_read(sc, bp, 0); break; } /* FALLTHROUGH */ case BIO_WRITE: mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); break; case BIO_GETATTR: case BIO_FLUSH: case BIO_DELETE: case BIO_ZONE: if (bp->bio_cmd == BIO_GETATTR) cbp->bio_done = g_eli_getattr_done; else cbp->bio_done = g_std_done; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); break; } } static int g_eli_newsession(struct g_eli_worker *wr) { struct g_eli_softc *sc; struct cryptoini crie, cria; int error; sc = wr->w_softc; bzero(&crie, sizeof(crie)); crie.cri_alg = sc->sc_ealgo; crie.cri_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crie.cri_klen <<= 1; if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) { crie.cri_key = g_eli_key_hold(sc, 0, LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize); } else { crie.cri_key = sc->sc_ekey; } if (sc->sc_flags & G_ELI_FLAG_AUTH) { bzero(&cria, sizeof(cria)); cria.cri_alg = sc->sc_aalgo; cria.cri_klen = sc->sc_akeylen; cria.cri_key = sc->sc_akey; crie.cri_next = &cria; } switch (sc->sc_crypto) { case G_ELI_CRYPTO_SW: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_SOFTWARE); break; case G_ELI_CRYPTO_HW: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_HARDWARE); break; case G_ELI_CRYPTO_UNKNOWN: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_HARDWARE); if (error == 0) { mtx_lock(&sc->sc_queue_mtx); if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) sc->sc_crypto = G_ELI_CRYPTO_HW; mtx_unlock(&sc->sc_queue_mtx); } else { error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_SOFTWARE); mtx_lock(&sc->sc_queue_mtx); if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) sc->sc_crypto = G_ELI_CRYPTO_SW; mtx_unlock(&sc->sc_queue_mtx); } break; default: panic("%s: invalid condition", __func__); } if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) g_eli_key_drop(sc, crie.cri_key); return (error); } static void g_eli_freesession(struct g_eli_worker *wr) { crypto_freesession(wr->w_sid); } static void g_eli_cancel(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { KASSERT(bp->bio_pflags == G_ELI_NEW_BIO, ("Not new bio when canceling (bp=%p).", bp)); g_io_deliver(bp, ENXIO); } } static struct bio * g_eli_takefirst(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) return (bioq_takefirst(&sc->sc_queue)); /* * Device suspended, so we skip new I/O requests. 
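 * BIOs that already entered the crypto path (bio_pflags != G_ELI_NEW_BIO)
 * must still be serviced so that sc_inflight can drain and the suspend
 * loop in g_eli_worker() can make progress; only brand new requests are
 * left sitting on the queue until the device is resumed.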
*/ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_pflags != G_ELI_NEW_BIO) break; } if (bp != NULL) bioq_remove(&sc->sc_queue, bp); return (bp); } /* * This is the main function for kernel worker thread when we don't have * hardware acceleration and we have to do cryptography in software. * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM * threads with crypto work. */ static void g_eli_worker(void *arg) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; wr = arg; sc = wr->w_softc; #ifdef EARLY_AP_STARTUP MPASS(!sc->sc_cpubind || smp_started); #elif defined(SMP) /* Before sched_bind() to a CPU, wait for all CPUs to go on-line. */ if (sc->sc_cpubind) { while (!smp_started) tsleep(wr, 0, "geli:smp", hz / 4); } #endif thread_lock(curthread); sched_prio(curthread, PUSER); if (sc->sc_cpubind) sched_bind(curthread, wr->w_number % mp_ncpus); thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); for (;;) { mtx_lock(&sc->sc_queue_mtx); again: bp = g_eli_takefirst(sc); if (bp == NULL) { if (sc->sc_flags & G_ELI_FLAG_DESTROY) { g_eli_cancel(sc); LIST_REMOVE(wr, w_next); g_eli_freesession(wr); free(wr, M_ELI); G_ELI_DEBUG(1, "Thread %s exiting.", curthread->td_proc->p_comm); wakeup(&sc->sc_workers); mtx_unlock(&sc->sc_queue_mtx); kproc_exit(0); } while (sc->sc_flags & G_ELI_FLAG_SUSPEND) { if (sc->sc_inflight > 0) { G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight); /* * We still have inflight BIOs, so * sleep and retry. */ msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:inf", hz / 5); goto again; } /* * Suspend requested, mark the worker as * suspended and go to sleep. */ if (wr->w_active) { g_eli_freesession(wr); wr->w_active = FALSE; } wakeup(&sc->sc_workers); msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); if (!wr->w_active && !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) { error = g_eli_newsession(wr); KASSERT(error == 0, ("g_eli_newsession() failed on resume (error=%d)", error)); wr->w_active = TRUE; } goto again; } msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0); continue; } if (bp->bio_pflags == G_ELI_NEW_BIO) atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_pflags == G_ELI_NEW_BIO) { bp->bio_pflags = 0; if (sc->sc_flags & G_ELI_FLAG_AUTH) { if (bp->bio_cmd == BIO_READ) g_eli_auth_read(sc, bp); else g_eli_auth_run(wr, bp); } else { if (bp->bio_cmd == BIO_READ) g_eli_crypto_read(sc, bp, 1); else g_eli_crypto_run(wr, bp); } } else { if (sc->sc_flags & G_ELI_FLAG_AUTH) g_eli_auth_run(wr, bp); else g_eli_crypto_run(wr, bp); } } } static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp, off_t offset, struct g_eli_metadata *md) { struct g_geom *gp; struct g_consumer *cp; u_char *buf = NULL; int error; g_topology_assert(); gp = g_new_geomf(mp, "eli:taste"); gp->start = g_eli_start; gp->access = g_std_access; /* * g_eli_read_metadata() is always called from the event thread. * Our geom is created and destroyed in the same event, so there * could be no orphan nor spoil event in the meantime. */ gp->orphan = g_eli_orphan_spoil_assert; gp->spoiled = g_eli_orphan_spoil_assert; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) goto end; error = g_access(cp, 1, 0, 0); if (error != 0) goto end; g_topology_unlock(); buf = g_read_data(cp, offset, pp->sectorsize, &error); g_topology_lock(); if (buf == NULL) goto end; error = eli_metadata_decode(buf, md); if (error != 0) goto end; /* Metadata was read and decoded successfully. 
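 * Success and failure converge on the label below: the read buffer is
 * freed, the access reference is dropped, and the temporary tasting
 * consumer and geom are destroyed either way.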
*/ end: if (buf != NULL) g_free(buf); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, 0, 0); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); return (error); } int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md) { return (g_eli_read_metadata_offset(mp, pp, pp->mediasize - pp->sectorsize, md)); } /* * The function is called when we had last close on provider and user requested * to close it when this situation occur. */ static void g_eli_last_close(void *arg, int flags __unused) { struct g_geom *gp; char gpname[64]; int error; g_topology_assert(); gp = arg; strlcpy(gpname, gp->name, sizeof(gpname)); error = g_eli_destroy(gp->softc, TRUE); KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).", gpname, error)); G_ELI_DEBUG(0, "Detached %s on last close.", gpname); } int g_eli_access(struct g_provider *pp, int dr, int dw, int de) { struct g_eli_softc *sc; struct g_geom *gp; gp = pp->geom; sc = gp->softc; if (dw > 0) { if (sc->sc_flags & G_ELI_FLAG_RO) { /* Deny write attempts. */ return (EROFS); } /* Someone is opening us for write, we need to remember that. */ sc->sc_flags |= G_ELI_FLAG_WOPEN; return (0); } /* Is this the last close? */ if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0) return (0); /* * Automatically detach on last close if requested. */ if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) || (sc->sc_flags & G_ELI_FLAG_WOPEN)) { g_post_event(g_eli_last_close, gp, M_WAITOK, NULL); } return (0); } static int g_eli_cpu_is_disabled(int cpu) { #ifdef SMP return (CPU_ISSET(cpu, &hlt_cpus_mask)); #else return (0); #endif } struct g_geom * g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, const struct g_eli_metadata *md, const u_char *mkey, int nkey) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; u_int i, threads; int dcw, error; G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX); gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX); sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO); gp->start = g_eli_start; /* * Spoiling can happen even though we have the provider open * exclusively, e.g. through media change events. */ gp->spoiled = g_eli_orphan; gp->orphan = g_eli_orphan; gp->resize = g_eli_resize; gp->dumpconf = g_eli_dumpconf; /* * If detach-on-last-close feature is not enabled and we don't operate * on read-only provider, we can simply use g_std_access(). */ if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO)) gp->access = g_eli_access; else gp->access = g_std_access; eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize); sc->sc_nkey = nkey; gp->softc = sc; sc->sc_geom = gp; bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF); mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF); pp = NULL; cp = g_new_consumer(gp); error = g_attach(cp, bpp); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot attach to %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).", bpp->name, error); } goto failed; } /* * Keep provider open all the time, so we can run critical tasks, * like Master Keys deletion, without wondering if we can open * provider or not. * We don't open provider for writing only when user requested read-only * access. */ dcw = (sc->sc_flags & G_ELI_FLAG_RO) ? 
0 : 1; error = g_access(cp, 1, dcw, 1); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot access %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot access %s (error=%d).", bpp->name, error); } goto failed; } /* * Remember the keys in our softc structure. */ g_eli_mkey_propagate(sc, mkey); LIST_INIT(&sc->sc_workers); threads = g_eli_threads; if (threads == 0) threads = mp_ncpus; sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus); for (i = 0; i < threads; i++) { if (g_eli_cpu_is_disabled(i)) { G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.", bpp->name, i); continue; } wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO); wr->w_softc = sc; wr->w_number = i; wr->w_active = TRUE; error = g_eli_newsession(wr); if (error != 0) { free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } goto failed; } error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0, "g_eli[%u] %s", i, bpp->name); if (error != 0) { g_eli_freesession(wr); free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } goto failed; } LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next); } /* * Create decrypted provider. */ pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); G_ELI_DEBUG(0, "Device %s created.", pp->name); G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo), sc->sc_ekeylen); switch (sc->sc_ealgo) { case CRYPTO_3DES_CBC: gone_in(13, "support for GEOM_ELI volumes encrypted with 3des"); break; case CRYPTO_BLF_CBC: gone_in(13, "support for GEOM_ELI volumes encrypted with blowfish"); break; } if (sc->sc_flags & G_ELI_FLAG_AUTH) { G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo)); switch (sc->sc_aalgo) { case CRYPTO_MD5_HMAC: gone_in(13, "support for GEOM_ELI volumes authenticated with hmac/md5"); break; } } G_ELI_DEBUG(0, " Crypto: %s", sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware"); return (gp); failed: mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); /* * Wait for kernel threads self destruction. 
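 * Each worker notices G_ELI_FLAG_DESTROY on its next pass through
 * g_eli_worker(), frees its crypto session, removes itself from
 * sc_workers and wakes us before exiting, so the loop below only has to
 * sleep until the list drains.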
*/ while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, -dcw, -1); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); g_eli_key_destroy(sc); bzero(sc, sizeof(*sc)); free(sc, M_ELI); return (NULL); } int g_eli_destroy(struct g_eli_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_ELI_DEBUG(1, "Device %s is still open, so it " "cannot be definitely removed.", pp->name); sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; g_wither_provider(pp, ENXIO); return (EBUSY); } else { G_ELI_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); gp->softc = NULL; g_eli_key_destroy(sc); bzero(sc, sizeof(*sc)); free(sc, M_ELI); if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) G_ELI_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom_close(gp, ENXIO); return (0); } static int g_eli_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_eli_softc *sc; sc = gp->softc; return (g_eli_destroy(sc, FALSE)); } static int g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider) { u_char *keyfile, *data; char *file, name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL && i == 0) { /* * If there is only one keyfile, allow simpler name. */ snprintf(name, sizeof(name), "%s:geli_keyfile", provider); keyfile = preload_search_by_type(name); } if (keyfile == NULL) return (i); /* Return number of loaded keyfiles. */ data = preload_fetch_addr(keyfile); if (data == NULL) { G_ELI_DEBUG(0, "Cannot find key file data for %s.", name); return (0); } size = preload_fetch_size(keyfile); if (size == 0) { G_ELI_DEBUG(0, "Cannot find key file size for %s.", name); return (0); } file = preload_search_info(keyfile, MODINFO_NAME); if (file == NULL) { G_ELI_DEBUG(0, "Cannot find key file name for %s.", name); return (0); } G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file, provider, name); g_eli_crypto_hmac_update(ctx, data, size); } } static void g_eli_keyfiles_clear(const char *provider) { u_char *keyfile, *data; char name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL) return; data = preload_fetch_addr(keyfile); size = preload_fetch_size(keyfile); if (data != NULL && size != 0) bzero(data, size); } } /* * Tasting is only made on boot. * We detect providers which should be attached before root is mounted. 
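 * Key material is expected to have been preloaded by the loader.  An
 * illustrative loader.conf(5) fragment for a provider named ada0p2 (the
 * name and paths are examples only; see geli(8) for the authoritative
 * syntax) could look like:
 *
 *	geli_ada0p2_keyfile0_load="YES"
 *	geli_ada0p2_keyfile0_type="ada0p2:geli_keyfile0"
 *	geli_ada0p2_keyfile0_name="/boot/keys/ada0p2.key"
 *	kern.geom.eli.passphrase="..."
 *
 * The type string matches the "provider:geli_keyfile<n>" names searched
 * for in g_eli_keyfiles_load(), and the passphrase tunable is consumed
 * and unset by fetch_loader_passphrase() above.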
*/ static struct g_geom * g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_eli_metadata md; struct g_geom *gp; struct hmac_ctx ctx; char passphrase[256]; u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN]; u_int i, nkey, nkeyfiles, tries, showpass; int error; struct keybuf *keybuf; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); if (root_mounted() || g_eli_tries == 0) return (NULL); G_ELI_DEBUG(3, "Tasting %s.", pp->name); error = g_eli_read_metadata(mp, pp, &md); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_ELI_MAGIC) != 0) return (NULL); if (md.md_version > G_ELI_VERSION) { printf("geom_eli.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); /* Should we attach it on boot? */ if (!(md.md_flags & G_ELI_FLAG_BOOT)) return (NULL); if (md.md_keys == 0x00) { G_ELI_DEBUG(0, "No valid keys on %s.", pp->name); return (NULL); } if (md.md_iterations == -1) { /* If there is no passphrase, we try only once. */ tries = 1; } else { /* Ask for the passphrase no more than g_eli_tries times. */ tries = g_eli_tries; } if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, try all GELI keys. */ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { memcpy(key, keybuf->kb_ents[i].ke_data, sizeof(key)); if (g_eli_mkey_decrypt_any(&md, key, mkey, &nkey) == 0 ) { explicit_bzero(key, sizeof(key)); goto have_key; } } } } for (i = 0; i <= tries; i++) { g_eli_crypto_hmac_init(&ctx, NULL, 0); /* * Load all key files. */ nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name); if (nkeyfiles == 0 && md.md_iterations == -1) { /* * No key files and no passphrase, something is * definitely wrong here. * geli(8) doesn't allow for such situation, so assume * that there was really no passphrase and in that case * key files are no properly defined in loader.conf. */ G_ELI_DEBUG(0, "Found no key files in loader.conf for %s.", pp->name); return (NULL); } /* Ask for the passphrase if defined. */ if (md.md_iterations >= 0) { /* Try first with cached passphrase. */ if (i == 0) { if (!g_eli_boot_passcache) continue; memcpy(passphrase, cached_passphrase, sizeof(passphrase)); } else { printf("Enter passphrase for %s: ", pp->name); showpass = g_eli_visible_passphrase; if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0) showpass = GETS_ECHOPASS; cngets(passphrase, sizeof(passphrase), showpass); memcpy(cached_passphrase, passphrase, sizeof(passphrase)); } } /* * Prepare Derived-Key from the user passphrase. */ if (md.md_iterations == 0) { g_eli_crypto_hmac_update(&ctx, md.md_salt, sizeof(md.md_salt)); g_eli_crypto_hmac_update(&ctx, passphrase, strlen(passphrase)); explicit_bzero(passphrase, sizeof(passphrase)); } else if (md.md_iterations > 0) { u_char dkey[G_ELI_USERKEYLEN]; pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt, sizeof(md.md_salt), passphrase, md.md_iterations); bzero(passphrase, sizeof(passphrase)); g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey)); explicit_bzero(dkey, sizeof(dkey)); } g_eli_crypto_hmac_final(&ctx, key, 0); /* * Decrypt Master-Key. */ error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); bzero(key, sizeof(key)); if (error == -1) { if (i == tries) { G_ELI_DEBUG(0, "Wrong key for %s. No tries left.", pp->name); g_eli_keyfiles_clear(pp->name); return (NULL); } if (i > 0) { G_ELI_DEBUG(0, "Wrong key for %s. Tries left: %u.", pp->name, tries - i); } /* Try again. 
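 * Pass 0 only consumed the passphrase cached by the loader (if any);
 * from pass 1 onwards the user is prompted on the console, so the
 * g_eli_tries budget effectively counts interactive attempts.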
*/ continue; } else if (error > 0) { G_ELI_DEBUG(0, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); g_eli_keyfiles_clear(pp->name); return (NULL); } g_eli_keyfiles_clear(pp->name); G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); break; } have_key: /* * We have correct key, let's attach provider. */ gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey); bzero(mkey, sizeof(mkey)); bzero(&md, sizeof(md)); if (gp == NULL) { G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name, G_ELI_SUFFIX); return (NULL); } return (gp); } static void g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_eli_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL || cp != NULL) return; /* Nothing here. */ sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_total); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_allocated); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if (sc->sc_flags & (flag)) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND"); ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY"); ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER"); ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME"); ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT"); ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH"); ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH"); ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH"); ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN"); ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY"); ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY"); ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE"); ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT"); ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS"); ADD_FLAG(G_ELI_FLAG_AUTORESIZE, "AUTORESIZE"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_nkey); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_version); sbuf_printf(sb, "%s", indent); switch (sc->sc_crypto) { case G_ELI_CRYPTO_HW: sbuf_cat(sb, "hardware"); break; case G_ELI_CRYPTO_SW: sbuf_cat(sb, "software"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); if (sc->sc_flags & G_ELI_FLAG_AUTH) { sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_aalgo)); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_ekeylen); sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_ealgo)); sbuf_printf(sb, "%s%s\n", indent, (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE"); } static void g_eli_shutdown_pre_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp; struct g_eli_softc *sc; int error; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name)); if (pp->acr + pp->acw + pp->ace == 0) error = g_eli_destroy(sc, TRUE); else { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; } } g_topology_unlock(); } static void g_eli_init(struct g_class *mp) { g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_eli_pre_sync == NULL) G_ELI_DEBUG(0, "Warning! 
Cannot register shutdown event."); } static void g_eli_fini(struct g_class *mp) { if (g_eli_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync); } DECLARE_GEOM_CLASS(g_eli_class, g_eli); MODULE_DEPEND(g_eli, crypto, 1, 1, 1); MODULE_VERSION(geom_eli, 0); Index: head/sys/geom/eli/g_eli.h =================================================================== --- head/sys/geom/eli/g_eli.h (revision 350693) +++ head/sys/geom/eli/g_eli.h (revision 350694) @@ -1,741 +1,723 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2019 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_ELI_H_ #define _G_ELI_H_ #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #else #include #include #include #include #endif #include #include #ifndef _OpenSSL_ #include #endif #define G_ELI_CLASS_NAME "ELI" #define G_ELI_MAGIC "GEOM::ELI" #define G_ELI_SUFFIX ".eli" /* * Version history: * 0 - Initial version number. * 1 - Added data authentication support (md_aalgo field and * G_ELI_FLAG_AUTH flag). * 2 - Added G_ELI_FLAG_READONLY. * 3 - Added 'configure' subcommand. * 4 - IV is generated from offset converted to little-endian * (the G_ELI_FLAG_NATIVE_BYTE_ORDER flag will be set for older versions). * 5 - Added multiple encrypton keys and AES-XTS support. * 6 - Fixed usage of multiple keys for authenticated providers (the * G_ELI_FLAG_FIRST_KEY flag will be set for older versions). * 7 - Encryption keys are now generated from the Data Key and not from the * IV Key (the G_ELI_FLAG_ENC_IVKEY flag will be set for older versions). */ #define G_ELI_VERSION_00 0 #define G_ELI_VERSION_01 1 #define G_ELI_VERSION_02 2 #define G_ELI_VERSION_03 3 #define G_ELI_VERSION_04 4 #define G_ELI_VERSION_05 5 #define G_ELI_VERSION_06 6 #define G_ELI_VERSION_07 7 #define G_ELI_VERSION G_ELI_VERSION_07 /* ON DISK FLAGS. */ /* Use random, onetime keys. */ #define G_ELI_FLAG_ONETIME 0x00000001 /* Ask for the passphrase from the kernel, before mounting root. */ #define G_ELI_FLAG_BOOT 0x00000002 /* Detach on last close, if we were open for writing. */ #define G_ELI_FLAG_WO_DETACH 0x00000004 /* Detach on last close. 
*/ #define G_ELI_FLAG_RW_DETACH 0x00000008 /* Provide data authentication. */ #define G_ELI_FLAG_AUTH 0x00000010 /* Provider is read-only, we should deny all write attempts. */ #define G_ELI_FLAG_RO 0x00000020 /* Don't pass through BIO_DELETE requests. */ #define G_ELI_FLAG_NODELETE 0x00000040 /* This GELI supports GELIBoot */ #define G_ELI_FLAG_GELIBOOT 0x00000080 /* Hide passphrase length in GELIboot. */ #define G_ELI_FLAG_GELIDISPLAYPASS 0x00000100 /* Expand provider automatically. */ #define G_ELI_FLAG_AUTORESIZE 0x00000200 /* RUNTIME FLAGS. */ /* Provider was open for writing. */ #define G_ELI_FLAG_WOPEN 0x00010000 /* Destroy device. */ #define G_ELI_FLAG_DESTROY 0x00020000 /* Provider uses native byte-order for IV generation. */ #define G_ELI_FLAG_NATIVE_BYTE_ORDER 0x00040000 /* Provider uses single encryption key. */ #define G_ELI_FLAG_SINGLE_KEY 0x00080000 /* Device suspended. */ #define G_ELI_FLAG_SUSPEND 0x00100000 /* Provider uses first encryption key. */ #define G_ELI_FLAG_FIRST_KEY 0x00200000 /* Provider uses IV-Key for encryption key generation. */ #define G_ELI_FLAG_ENC_IVKEY 0x00400000 #define G_ELI_NEW_BIO 255 #define SHA512_MDLEN 64 #define G_ELI_AUTH_SECKEYLEN SHA256_DIGEST_LENGTH #define G_ELI_MAXMKEYS 2 #define G_ELI_MAXKEYLEN 64 #define G_ELI_USERKEYLEN G_ELI_MAXKEYLEN #define G_ELI_DATAKEYLEN G_ELI_MAXKEYLEN #define G_ELI_AUTHKEYLEN G_ELI_MAXKEYLEN #define G_ELI_IVKEYLEN G_ELI_MAXKEYLEN #define G_ELI_SALTLEN 64 #define G_ELI_DATAIVKEYLEN (G_ELI_DATAKEYLEN + G_ELI_IVKEYLEN) /* Data-Key, IV-Key, HMAC_SHA512(Derived-Key, Data-Key+IV-Key) */ #define G_ELI_MKEYLEN (G_ELI_DATAIVKEYLEN + SHA512_MDLEN) #define G_ELI_OVERWRITES 5 /* Switch data encryption key every 2^20 blocks. */ #define G_ELI_KEY_SHIFT 20 #define G_ELI_CRYPTO_UNKNOWN 0 #define G_ELI_CRYPTO_HW 1 #define G_ELI_CRYPTO_SW 2 #ifdef _KERNEL #if (MAX_KEY_BYTES < G_ELI_DATAIVKEYLEN) #error "MAX_KEY_BYTES is less than G_ELI_DATAKEYLEN" #endif extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; -#define G_ELI_DEBUG(lvl, ...) do { \ - if (g_eli_debug >= (lvl)) { \ - printf("GEOM_ELI"); \ - if (g_eli_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_ELI_LOGREQ(lvl, bp, ...) do { \ - if (g_eli_debug >= (lvl)) { \ - printf("GEOM_ELI"); \ - if (g_eli_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_ELI_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_ELI", g_eli_debug, (lvl), NULL, __VA_ARGS__) +#define G_ELI_LOGREQ(lvl, bp, ...) 
\ + _GEOM_DEBUG("GEOM_ELI", g_eli_debug, (lvl), (bp), __VA_ARGS__) struct g_eli_worker { struct g_eli_softc *w_softc; struct proc *w_proc; u_int w_number; crypto_session_t w_sid; boolean_t w_active; LIST_ENTRY(g_eli_worker) w_next; }; #endif /* _KERNEL */ struct g_eli_softc { struct g_geom *sc_geom; u_int sc_version; u_int sc_crypto; uint8_t sc_mkey[G_ELI_DATAIVKEYLEN]; uint8_t sc_ekey[G_ELI_DATAKEYLEN]; TAILQ_HEAD(, g_eli_key) sc_ekeys_queue; RB_HEAD(g_eli_key_tree, g_eli_key) sc_ekeys_tree; struct mtx sc_ekeys_lock; uint64_t sc_ekeys_total; uint64_t sc_ekeys_allocated; u_int sc_ealgo; u_int sc_ekeylen; uint8_t sc_akey[G_ELI_AUTHKEYLEN]; u_int sc_aalgo; u_int sc_akeylen; u_int sc_alen; SHA256_CTX sc_akeyctx; uint8_t sc_ivkey[G_ELI_IVKEYLEN]; SHA256_CTX sc_ivctx; int sc_nkey; uint32_t sc_flags; int sc_inflight; off_t sc_mediasize; size_t sc_sectorsize; off_t sc_provsize; u_int sc_bytes_per_sector; u_int sc_data_per_sector; #ifndef _KERNEL int sc_cpubind; #else /* _KERNEL */ boolean_t sc_cpubind; /* Only for software cryptography. */ struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; LIST_HEAD(, g_eli_worker) sc_workers; #endif /* _KERNEL */ }; #define sc_name sc_geom->name #define G_ELI_KEY_MAGIC 0xe11341c struct g_eli_key { /* Key value, must be first in the structure. */ uint8_t gek_key[G_ELI_DATAKEYLEN]; /* Magic. */ int gek_magic; /* Key number. */ uint64_t gek_keyno; /* Reference counter. */ int gek_count; /* Keeps keys sorted by most recent use. */ TAILQ_ENTRY(g_eli_key) gek_next; /* Keeps keys sorted by number. */ RB_ENTRY(g_eli_key) gek_link; }; struct g_eli_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ uint32_t md_flags; /* Additional flags. */ uint16_t md_ealgo; /* Encryption algorithm. */ uint16_t md_keylen; /* Key length. */ uint16_t md_aalgo; /* Authentication algorithm. */ uint64_t md_provsize; /* Provider's size. */ uint32_t md_sectorsize; /* Sector size. */ uint8_t md_keys; /* Available keys. */ int32_t md_iterations; /* Number of iterations for PKCS#5v2. */ uint8_t md_salt[G_ELI_SALTLEN]; /* Salt. */ /* Encrypted master key (IV-key, Data-key, HMAC). */ uint8_t md_mkeys[G_ELI_MAXMKEYS * G_ELI_MKEYLEN]; u_char md_hash[16]; /* MD5 hash. 
*/ } __packed; #ifndef _OpenSSL_ static __inline void eli_metadata_encode_v0(struct g_eli_metadata *md, u_char **datap) { u_char *p; p = *datap; le32enc(p, md->md_flags); p += sizeof(md->md_flags); le16enc(p, md->md_ealgo); p += sizeof(md->md_ealgo); le16enc(p, md->md_keylen); p += sizeof(md->md_keylen); le64enc(p, md->md_provsize); p += sizeof(md->md_provsize); le32enc(p, md->md_sectorsize); p += sizeof(md->md_sectorsize); *p = md->md_keys; p += sizeof(md->md_keys); le32enc(p, md->md_iterations); p += sizeof(md->md_iterations); bcopy(md->md_salt, p, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(md->md_mkeys, p, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); *datap = p; } static __inline void eli_metadata_encode_v1v2v3v4v5v6v7(struct g_eli_metadata *md, u_char **datap) { u_char *p; p = *datap; le32enc(p, md->md_flags); p += sizeof(md->md_flags); le16enc(p, md->md_ealgo); p += sizeof(md->md_ealgo); le16enc(p, md->md_keylen); p += sizeof(md->md_keylen); le16enc(p, md->md_aalgo); p += sizeof(md->md_aalgo); le64enc(p, md->md_provsize); p += sizeof(md->md_provsize); le32enc(p, md->md_sectorsize); p += sizeof(md->md_sectorsize); *p = md->md_keys; p += sizeof(md->md_keys); le32enc(p, md->md_iterations); p += sizeof(md->md_iterations); bcopy(md->md_salt, p, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(md->md_mkeys, p, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); *datap = p; } static __inline void eli_metadata_encode(struct g_eli_metadata *md, u_char *data) { uint32_t hash[4]; MD5_CTX ctx; u_char *p; p = data; bcopy(md->md_magic, p, sizeof(md->md_magic)); p += sizeof(md->md_magic); le32enc(p, md->md_version); p += sizeof(md->md_version); switch (md->md_version) { case G_ELI_VERSION_00: eli_metadata_encode_v0(md, &p); break; case G_ELI_VERSION_01: case G_ELI_VERSION_02: case G_ELI_VERSION_03: case G_ELI_VERSION_04: case G_ELI_VERSION_05: case G_ELI_VERSION_06: case G_ELI_VERSION_07: eli_metadata_encode_v1v2v3v4v5v6v7(md, &p); break; default: #ifdef _KERNEL panic("%s: Unsupported version %u.", __func__, (u_int)md->md_version); #else assert(!"Unsupported metadata version."); #endif } MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); bcopy(md->md_hash, p, sizeof(md->md_hash)); } static __inline int eli_metadata_decode_v0(const u_char *data, struct g_eli_metadata *md) { uint32_t hash[4]; MD5_CTX ctx; const u_char *p; p = data + sizeof(md->md_magic) + sizeof(md->md_version); md->md_flags = le32dec(p); p += sizeof(md->md_flags); md->md_ealgo = le16dec(p); p += sizeof(md->md_ealgo); md->md_keylen = le16dec(p); p += sizeof(md->md_keylen); md->md_provsize = le64dec(p); p += sizeof(md->md_provsize); md->md_sectorsize = le32dec(p); p += sizeof(md->md_sectorsize); md->md_keys = *p; p += sizeof(md->md_keys); md->md_iterations = le32dec(p); p += sizeof(md->md_iterations); bcopy(p, md->md_salt, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(p, md->md_mkeys, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); if (bcmp(md->md_hash, p, 16) != 0) return (EINVAL); return (0); } static __inline int eli_metadata_decode_v1v2v3v4v5v6v7(const u_char *data, struct g_eli_metadata *md) { uint32_t hash[4]; MD5_CTX ctx; const u_char *p; p = data + sizeof(md->md_magic) + sizeof(md->md_version); md->md_flags = le32dec(p); p += sizeof(md->md_flags); md->md_ealgo = le16dec(p); p += sizeof(md->md_ealgo); 
md->md_keylen = le16dec(p); p += sizeof(md->md_keylen); md->md_aalgo = le16dec(p); p += sizeof(md->md_aalgo); md->md_provsize = le64dec(p); p += sizeof(md->md_provsize); md->md_sectorsize = le32dec(p); p += sizeof(md->md_sectorsize); md->md_keys = *p; p += sizeof(md->md_keys); md->md_iterations = le32dec(p); p += sizeof(md->md_iterations); bcopy(p, md->md_salt, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(p, md->md_mkeys, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); if (bcmp(md->md_hash, p, 16) != 0) return (EINVAL); return (0); } static __inline int eli_metadata_decode(const u_char *data, struct g_eli_metadata *md) { int error; bcopy(data, md->md_magic, sizeof(md->md_magic)); if (strcmp(md->md_magic, G_ELI_MAGIC) != 0) return (EINVAL); md->md_version = le32dec(data + sizeof(md->md_magic)); switch (md->md_version) { case G_ELI_VERSION_00: error = eli_metadata_decode_v0(data, md); break; case G_ELI_VERSION_01: case G_ELI_VERSION_02: case G_ELI_VERSION_03: case G_ELI_VERSION_04: case G_ELI_VERSION_05: case G_ELI_VERSION_06: case G_ELI_VERSION_07: error = eli_metadata_decode_v1v2v3v4v5v6v7(data, md); break; default: error = EOPNOTSUPP; break; } return (error); } #endif /* !_OpenSSL */ static __inline u_int g_eli_str2ealgo(const char *name) { if (strcasecmp("null", name) == 0) return (CRYPTO_NULL_CBC); else if (strcasecmp("null-cbc", name) == 0) return (CRYPTO_NULL_CBC); else if (strcasecmp("aes", name) == 0) return (CRYPTO_AES_XTS); else if (strcasecmp("aes-cbc", name) == 0) return (CRYPTO_AES_CBC); else if (strcasecmp("aes-xts", name) == 0) return (CRYPTO_AES_XTS); else if (strcasecmp("blowfish", name) == 0) return (CRYPTO_BLF_CBC); else if (strcasecmp("blowfish-cbc", name) == 0) return (CRYPTO_BLF_CBC); else if (strcasecmp("camellia", name) == 0) return (CRYPTO_CAMELLIA_CBC); else if (strcasecmp("camellia-cbc", name) == 0) return (CRYPTO_CAMELLIA_CBC); else if (strcasecmp("3des", name) == 0) return (CRYPTO_3DES_CBC); else if (strcasecmp("3des-cbc", name) == 0) return (CRYPTO_3DES_CBC); return (CRYPTO_ALGORITHM_MIN - 1); } static __inline u_int g_eli_str2aalgo(const char *name) { if (strcasecmp("hmac/md5", name) == 0) return (CRYPTO_MD5_HMAC); else if (strcasecmp("hmac/sha1", name) == 0) return (CRYPTO_SHA1_HMAC); else if (strcasecmp("hmac/ripemd160", name) == 0) return (CRYPTO_RIPEMD160_HMAC); else if (strcasecmp("hmac/sha256", name) == 0) return (CRYPTO_SHA2_256_HMAC); else if (strcasecmp("hmac/sha384", name) == 0) return (CRYPTO_SHA2_384_HMAC); else if (strcasecmp("hmac/sha512", name) == 0) return (CRYPTO_SHA2_512_HMAC); return (CRYPTO_ALGORITHM_MIN - 1); } static __inline const char * g_eli_algo2str(u_int algo) { switch (algo) { case CRYPTO_NULL_CBC: return ("NULL"); case CRYPTO_AES_CBC: return ("AES-CBC"); case CRYPTO_AES_XTS: return ("AES-XTS"); case CRYPTO_BLF_CBC: return ("Blowfish-CBC"); case CRYPTO_CAMELLIA_CBC: return ("CAMELLIA-CBC"); case CRYPTO_3DES_CBC: return ("3DES-CBC"); case CRYPTO_MD5_HMAC: return ("HMAC/MD5"); case CRYPTO_SHA1_HMAC: return ("HMAC/SHA1"); case CRYPTO_RIPEMD160_HMAC: return ("HMAC/RIPEMD160"); case CRYPTO_SHA2_256_HMAC: return ("HMAC/SHA256"); case CRYPTO_SHA2_384_HMAC: return ("HMAC/SHA384"); case CRYPTO_SHA2_512_HMAC: return ("HMAC/SHA512"); } return ("unknown"); } static __inline void eli_metadata_dump(const struct g_eli_metadata *md) { static const char hex[] = "0123456789abcdef"; char str[sizeof(md->md_mkeys) * 2 + 
1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" flags: 0x%x\n", (u_int)md->md_flags); printf(" ealgo: %s\n", g_eli_algo2str(md->md_ealgo)); printf(" keylen: %u\n", (u_int)md->md_keylen); if (md->md_flags & G_ELI_FLAG_AUTH) printf(" aalgo: %s\n", g_eli_algo2str(md->md_aalgo)); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf(" keys: 0x%02x\n", (u_int)md->md_keys); printf("iterations: %d\n", (int)md->md_iterations); bzero(str, sizeof(str)); for (i = 0; i < sizeof(md->md_salt); i++) { str[i * 2] = hex[md->md_salt[i] >> 4]; str[i * 2 + 1] = hex[md->md_salt[i] & 0x0f]; } printf(" Salt: %s\n", str); bzero(str, sizeof(str)); for (i = 0; i < sizeof(md->md_mkeys); i++) { str[i * 2] = hex[md->md_mkeys[i] >> 4]; str[i * 2 + 1] = hex[md->md_mkeys[i] & 0x0f]; } printf("Master Key: %s\n", str); bzero(str, sizeof(str)); for (i = 0; i < 16; i++) { str[i * 2] = hex[md->md_hash[i] >> 4]; str[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", str); } static __inline u_int g_eli_keylen(u_int algo, u_int keylen) { switch (algo) { case CRYPTO_NULL_CBC: if (keylen == 0) keylen = 64 * 8; else { if (keylen > 64 * 8) keylen = 0; } return (keylen); case CRYPTO_AES_CBC: case CRYPTO_CAMELLIA_CBC: switch (keylen) { case 0: return (128); case 128: case 192: case 256: return (keylen); default: return (0); } case CRYPTO_AES_XTS: switch (keylen) { case 0: return (128); case 128: case 256: return (keylen); default: return (0); } case CRYPTO_BLF_CBC: if (keylen == 0) return (128); if (keylen < 128 || keylen > 448) return (0); if ((keylen % 32) != 0) return (0); return (keylen); case CRYPTO_3DES_CBC: if (keylen == 0 || keylen == 192) return (192); return (0); default: return (0); } } static __inline u_int g_eli_hashlen(u_int algo) { switch (algo) { case CRYPTO_MD5_HMAC: return (16); case CRYPTO_SHA1_HMAC: return (20); case CRYPTO_RIPEMD160_HMAC: return (20); case CRYPTO_SHA2_256_HMAC: return (32); case CRYPTO_SHA2_384_HMAC: return (48); case CRYPTO_SHA2_512_HMAC: return (64); } return (0); } static __inline off_t eli_mediasize(const struct g_eli_softc *sc, off_t mediasize, u_int sectorsize) { if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) { mediasize -= sectorsize; } if ((sc->sc_flags & G_ELI_FLAG_AUTH) == 0) { mediasize -= (mediasize % sc->sc_sectorsize); } else { mediasize /= sc->sc_bytes_per_sector; mediasize *= sc->sc_sectorsize; } return (mediasize); } static __inline void eli_metadata_softc(struct g_eli_softc *sc, const struct g_eli_metadata *md, u_int sectorsize, off_t mediasize) { sc->sc_version = md->md_version; sc->sc_inflight = 0; sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN; sc->sc_flags = md->md_flags; /* Backward compatibility. 
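 * Metadata written by older versions cannot express these properties
 * explicitly, so the corresponding runtime-only flags are derived from
 * md_version (see the version history near the top of this header).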
*/ if (md->md_version < G_ELI_VERSION_04) sc->sc_flags |= G_ELI_FLAG_NATIVE_BYTE_ORDER; if (md->md_version < G_ELI_VERSION_05) sc->sc_flags |= G_ELI_FLAG_SINGLE_KEY; if (md->md_version < G_ELI_VERSION_06 && (sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { sc->sc_flags |= G_ELI_FLAG_FIRST_KEY; } if (md->md_version < G_ELI_VERSION_07) sc->sc_flags |= G_ELI_FLAG_ENC_IVKEY; sc->sc_ealgo = md->md_ealgo; if (sc->sc_flags & G_ELI_FLAG_AUTH) { sc->sc_akeylen = sizeof(sc->sc_akey) * 8; sc->sc_aalgo = md->md_aalgo; sc->sc_alen = g_eli_hashlen(sc->sc_aalgo); sc->sc_data_per_sector = sectorsize - sc->sc_alen; /* * Some hash functions (like SHA1 and RIPEMD160) generates hash * which length is not multiple of 128 bits, but we want data * length to be multiple of 128, so we can encrypt without * padding. The line below rounds down data length to multiple * of 128 bits. */ sc->sc_data_per_sector -= sc->sc_data_per_sector % 16; sc->sc_bytes_per_sector = (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1; sc->sc_bytes_per_sector *= sectorsize; } sc->sc_provsize = mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_mediasize = eli_mediasize(sc, mediasize, sectorsize); sc->sc_ekeylen = md->md_keylen; } #ifdef _KERNEL int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md); struct g_geom *g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, const struct g_eli_metadata *md, const u_char *mkey, int nkey); int g_eli_destroy(struct g_eli_softc *sc, boolean_t force); int g_eli_access(struct g_provider *pp, int dr, int dw, int de); void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb); void g_eli_read_done(struct bio *bp); void g_eli_write_done(struct bio *bp); int g_eli_crypto_rerun(struct cryptop *crp); void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker); void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp); void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp); void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp); #endif void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, size_t size); void g_eli_mkey_hmac(unsigned char *mkey, const unsigned char *key); int g_eli_mkey_decrypt(const struct g_eli_metadata *md, const unsigned char *key, unsigned char *mkey, unsigned keyp); int g_eli_mkey_decrypt_any(const struct g_eli_metadata *md, const unsigned char *key, unsigned char *mkey, unsigned *nkeyp); int g_eli_mkey_encrypt(unsigned algo, const unsigned char *key, unsigned keylen, unsigned char *mkey); #ifdef _KERNEL void g_eli_mkey_propagate(struct g_eli_softc *sc, const unsigned char *mkey); #endif int g_eli_crypto_encrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize); int g_eli_crypto_decrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize); struct hmac_ctx { SHA512_CTX innerctx; SHA512_CTX outerctx; }; void g_eli_crypto_hmac_init(struct hmac_ctx *ctx, const char *hkey, size_t hkeylen); void g_eli_crypto_hmac_update(struct hmac_ctx *ctx, const uint8_t *data, size_t datasize); void g_eli_crypto_hmac_final(struct hmac_ctx *ctx, uint8_t *md, size_t mdsize); void g_eli_crypto_hmac(const char *hkey, size_t hkeysize, const uint8_t *data, size_t datasize, uint8_t *md, size_t mdsize); void g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno); #ifdef _KERNEL void g_eli_key_init(struct g_eli_softc *sc); void g_eli_key_destroy(struct g_eli_softc *sc); void g_eli_key_resize(struct 
g_eli_softc *sc); uint8_t *g_eli_key_hold(struct g_eli_softc *sc, off_t offset, size_t blocksize); void g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey); #endif #endif /* !_G_ELI_H_ */ Index: head/sys/geom/eli/g_eli_ctl.c =================================================================== --- head/sys/geom/eli/g_eli_ctl.c (revision 350693) +++ head/sys/geom/eli/g_eli_ctl.c (revision 350694) @@ -1,1229 +1,1230 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include MALLOC_DECLARE(M_ELI); static void g_eli_ctl_attach(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; const char *name; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *readonly, *dryrunp; int keysize, error, nkey, dryrun, dummy; intmax_t *valp; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach == NULL) { gctl_error(req, "No '%s' argument.", "detach"); return; } /* "keyno" is optional for backward compatibility */ nkey = -1; valp = gctl_get_param(req, "keyno", &dummy); if (valp != NULL) { valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp != NULL) nkey = *valp; } if (nkey < -1 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } readonly = gctl_get_paraml(req, "readonly", sizeof(*readonly)); if (readonly == NULL) { gctl_error(req, "No '%s' argument.", "readonly"); return; } /* "dryrun" is optional for backward compatibility */ dryrun = 0; dryrunp = gctl_get_param(req, "dryrun", &dummy); if (dryrunp != NULL) { dryrunp = gctl_get_paraml(req, "dryrun", sizeof(*dryrunp)); if (dryrunp != NULL) dryrun = *dryrunp; } if (*detach && *readonly) { gctl_error(req, "Options -d and -r are mutually exclusive."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No '%s' argument.", "key"); return; } if (nkey == -1) error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); else error = g_eli_mkey_decrypt(&md, key, mkey, nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); if (*detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; if (*readonly) md.md_flags |= G_ELI_FLAG_RO; if (!dryrun) g_eli_create(req, mp, pp, &md, mkey, nkey); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static struct g_eli_softc * g_eli_find_device(struct g_class *mp, const char *prov) { struct g_eli_softc *sc; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; if (strncmp(prov, "/dev/", strlen("/dev/")) == 0) prov += strlen("/dev/"); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); if (pp != NULL && strcmp(pp->name, prov) == 0) return (sc); cp = 
LIST_FIRST(&gp->consumer); if (cp != NULL && cp->provider != NULL && strcmp(cp->provider->name, prov) == 0) { return (sc); } } return (NULL); } static void g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *force, *last, *nargs, error; const char *prov; char param[16]; int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } last = gctl_get_paraml(req, "last", sizeof(*last)); if (last == NULL) { gctl_error(req, "No '%s' argument.", "last"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { gctl_error(req, "No such device: %s.", prov); return; } if (*last) { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; sc->sc_geom->access = g_eli_access; } else { error = g_eli_destroy(sc, *force ? TRUE : FALSE); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } } static void g_eli_ctl_onetime(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; const char *name; intmax_t *keylen, *sectorsize; u_char mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *noautoresize, *notrim; g_topology_assert(); bzero(&md, sizeof(md)); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_ELI_MAGIC, sizeof(md.md_magic)); md.md_version = G_ELI_VERSION; md.md_flags |= G_ELI_FLAG_ONETIME; md.md_flags |= G_ELI_FLAG_AUTORESIZE; detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach != NULL && *detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; noautoresize = gctl_get_paraml(req, "noautoresize", sizeof(*noautoresize)); if (noautoresize != NULL && *noautoresize) md.md_flags &= ~G_ELI_FLAG_AUTORESIZE; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim != NULL && *notrim) md.md_flags |= G_ELI_FLAG_NODELETE; md.md_ealgo = CRYPTO_ALGORITHM_MIN - 1; name = gctl_get_asciiparam(req, "aalgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "aalgo"); return; } if (*name != '\0') { md.md_aalgo = g_eli_str2aalgo(name); if (md.md_aalgo >= CRYPTO_ALGORITHM_MIN && md.md_aalgo <= CRYPTO_ALGORITHM_MAX) { md.md_flags |= G_ELI_FLAG_AUTH; } else { /* * For backward compatibility, check if the -a option * was used to provide encryption algorithm. 
*/ md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid authentication algorithm."); return; } else { gctl_error(req, "warning: The -e option, not " "the -a option is now used to specify " "encryption algorithm to use."); } } } if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { name = gctl_get_asciiparam(req, "ealgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "ealgo"); return; } md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid encryption algorithm."); return; } } keylen = gctl_get_paraml(req, "keylen", sizeof(*keylen)); if (keylen == NULL) { gctl_error(req, "No '%s' argument.", "keylen"); return; } md.md_keylen = g_eli_keylen(md.md_ealgo, *keylen); if (md.md_keylen == 0) { gctl_error(req, "Invalid '%s' argument.", "keylen"); return; } /* Not important here. */ md.md_provsize = 0; /* Not important here. */ bzero(md.md_salt, sizeof(md.md_salt)); md.md_keys = 0x01; arc4rand(mkey, sizeof(mkey), 0); /* Not important here. */ bzero(md.md_hash, sizeof(md.md_hash)); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } sectorsize = gctl_get_paraml(req, "sectorsize", sizeof(*sectorsize)); if (sectorsize == NULL) { gctl_error(req, "No '%s' argument.", "sectorsize"); return; } if (*sectorsize == 0) md.md_sectorsize = pp->sectorsize; else { if (*sectorsize < 0 || (*sectorsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid sector size."); return; } if (*sectorsize > PAGE_SIZE) { gctl_error(req, "warning: Using sectorsize bigger than " "the page size!"); } md.md_sectorsize = *sectorsize; } g_eli_create(req, mp, pp, &md, mkey, -1); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static void g_eli_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; char param[16]; const char *prov; u_char *sector; int *nargs, *boot, *noboot, *trim, *notrim, *geliboot, *nogeliboot; int *displaypass, *nodisplaypass, *autoresize, *noautoresize; int zero, error, changed; u_int i; g_topology_assert(); changed = 0; zero = 0; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } boot = gctl_get_paraml(req, "boot", sizeof(*boot)); if (boot == NULL) boot = &zero; noboot = gctl_get_paraml(req, "noboot", sizeof(*noboot)); if (noboot == NULL) noboot = &zero; if (*boot && *noboot) { gctl_error(req, "Options -b and -B are mutually exclusive."); return; } if (*boot || *noboot) changed = 1; trim = gctl_get_paraml(req, "trim", sizeof(*trim)); if (trim == NULL) trim = &zero; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim == NULL) notrim = &zero; if (*trim && *notrim) { gctl_error(req, "Options -t and -T are mutually exclusive."); return; } if (*trim || *notrim) changed = 1; geliboot = gctl_get_paraml(req, "geliboot", sizeof(*geliboot)); if (geliboot == NULL) geliboot = &zero; nogeliboot = gctl_get_paraml(req, "nogeliboot", sizeof(*nogeliboot)); if (nogeliboot == NULL) nogeliboot = 
&zero; if (*geliboot && *nogeliboot) { gctl_error(req, "Options -g and -G are mutually exclusive."); return; } if (*geliboot || *nogeliboot) changed = 1; displaypass = gctl_get_paraml(req, "displaypass", sizeof(*displaypass)); if (displaypass == NULL) displaypass = &zero; nodisplaypass = gctl_get_paraml(req, "nodisplaypass", sizeof(*nodisplaypass)); if (nodisplaypass == NULL) nodisplaypass = &zero; if (*displaypass && *nodisplaypass) { gctl_error(req, "Options -d and -D are mutually exclusive."); return; } if (*displaypass || *nodisplaypass) changed = 1; autoresize = gctl_get_paraml(req, "autoresize", sizeof(*autoresize)); if (autoresize == NULL) autoresize = &zero; noautoresize = gctl_get_paraml(req, "noautoresize", sizeof(*noautoresize)); if (noautoresize == NULL) noautoresize = &zero; if (*autoresize && *noautoresize) { gctl_error(req, "Options -r and -R are mutually exclusive."); return; } if (*autoresize || *noautoresize) changed = 1; if (!changed) { gctl_error(req, "No option given."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { /* * We ignore not attached providers, userland part will * take care of them. */ G_ELI_DEBUG(1, "Skipping configuration of not attached " "provider %s.", prov); continue; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change configuration of " "read-only provider %s.", prov); continue; } if (*boot && (sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag already configured for %s.", prov); continue; } else if (*noboot && !(sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag not configured for %s.", prov); continue; } if (*notrim && (sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag already configured for %s.", prov); continue; } else if (*trim && !(sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag not configured for %s.", prov); continue; } if (*geliboot && (sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag already configured for %s.", prov); continue; } else if (*nogeliboot && !(sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag not configured for %s.", prov); continue; } if (*displaypass && (sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag already configured for %s.", prov); continue; } else if (*nodisplaypass && !(sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag not configured for %s.", prov); continue; } if (*autoresize && (sc->sc_flags & G_ELI_FLAG_AUTORESIZE)) { G_ELI_DEBUG(1, "AUTORESIZE flag already configured for %s.", prov); continue; } else if (*noautoresize && !(sc->sc_flags & G_ELI_FLAG_AUTORESIZE)) { G_ELI_DEBUG(1, "AUTORESIZE flag not configured for %s.", prov); continue; } if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { /* * ONETIME providers don't write metadata to * disk, so don't try reading it. This means * we're bit-flipping uninitialized memory in md * below, but that's OK; we don't do anything * with it later. 
*/ cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", prov, error); continue; } } if (*boot) { md.md_flags |= G_ELI_FLAG_BOOT; sc->sc_flags |= G_ELI_FLAG_BOOT; } else if (*noboot) { md.md_flags &= ~G_ELI_FLAG_BOOT; sc->sc_flags &= ~G_ELI_FLAG_BOOT; } if (*notrim) { md.md_flags |= G_ELI_FLAG_NODELETE; sc->sc_flags |= G_ELI_FLAG_NODELETE; } else if (*trim) { md.md_flags &= ~G_ELI_FLAG_NODELETE; sc->sc_flags &= ~G_ELI_FLAG_NODELETE; } if (*geliboot) { md.md_flags |= G_ELI_FLAG_GELIBOOT; sc->sc_flags |= G_ELI_FLAG_GELIBOOT; } else if (*nogeliboot) { md.md_flags &= ~G_ELI_FLAG_GELIBOOT; sc->sc_flags &= ~G_ELI_FLAG_GELIBOOT; } if (*displaypass) { md.md_flags |= G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags |= G_ELI_FLAG_GELIDISPLAYPASS; } else if (*nodisplaypass) { md.md_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; } if (*autoresize) { md.md_flags |= G_ELI_FLAG_AUTORESIZE; sc->sc_flags |= G_ELI_FLAG_AUTORESIZE; } else if (*noautoresize) { md.md_flags &= ~G_ELI_FLAG_AUTORESIZE; sc->sc_flags &= ~G_ELI_FLAG_AUTORESIZE; } if (sc->sc_flags & G_ELI_FLAG_ONETIME) { /* There's no metadata on disk so we are done here. */ continue; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", prov, error); } explicit_bzero(&md, sizeof(md)); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); } } static void g_eli_ctl_setkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, *mkeydst, *sector; intmax_t *valp; int keysize, nkey, error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } valp = gctl_get_paraml(req, "iterations", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "iterations"); return; } /* Check if iterations number should and can be changed. 
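This handler and the setkey/delkey handlers below all finish the same way: encode the updated metadata into one sector-sized buffer, write it over the provider's last sector, and scrub the buffer (the kernel uses explicit_bzero(9) precisely so the scrub cannot be optimized away). A minimal sketch of that convention, with hypothetical callbacks standing in for eli_metadata_encode() and g_write_data():

    #include <stdint.h>
    #include <string.h>

    typedef void (*md_encode_fn)(const void *md, uint8_t *sector);
    typedef int (*sector_write_fn)(uint64_t offset, const uint8_t *buf,
        uint32_t len);

    int
    store_metadata_last_sector(const void *md, uint8_t *sector,
        uint64_t mediasize, uint32_t sectorsize,
        md_encode_fn encode, sector_write_fn write_sector)
    {
        int error;

        encode(md, sector);
        /* geli metadata always occupies the provider's final sector. */
        error = write_sector(mediasize - sectorsize, sector, sectorsize);
        /* Scrub: the buffer held (encrypted) master-key material. */
        memset(sector, 0, sectorsize);
        return (error);
    }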
*/ if (*valp != -1 && md.md_iterations == -1) { md.md_iterations = *valp; } else if (*valp != -1 && *valp != md.md_iterations) { if (bitcount32(md.md_keys) != 1) { gctl_error(req, "To be able to use '-i' option, only " "one key can be defined."); return; } if (md.md_keys != (1 << nkey)) { gctl_error(req, "Only already defined key can be " "changed when '-i' option is used."); return; } md.md_iterations = *valp; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; md.md_keys |= (1 << nkey); bcopy(sc->sc_mkey, mkeydst, sizeof(sc->sc_mkey)); /* Encrypt Master Key with the new key. */ error = g_eli_mkey_encrypt(md.md_ealgo, key, md.md_keylen, mkeydst); explicit_bzero(key, keysize); if (error != 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot encrypt Master Key (error=%d).", error); return; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); /* Store metadata with fresh key. */ eli_metadata_encode(&md, sector); explicit_bzero(&md, sizeof(md)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Key %u changed on %s.", nkey, pp->name); } static void g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *mkeydst, *sector; intmax_t *valp; size_t keysize; int error, nkey, *all, *force; u_int i; g_topology_assert(); nkey = 0; /* fixes causeless gcc warning */ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot delete keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (*all) { mkeydst = md.md_mkeys; keysize = sizeof(md.md_mkeys); } else { force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } if (!(md.md_keys & (1 << nkey)) && !*force) { gctl_error(req, "Master Key %u is not set.", nkey); return; } md.md_keys &= ~(1 << nkey); if (md.md_keys == 0 && !*force) { gctl_error(req, "This is the last Master Key. Use '-f' " "flag if you really want to remove it."); return; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; keysize = G_ELI_MKEYLEN; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) explicit_bzero(mkeydst, keysize); else arc4rand(mkeydst, keysize, 0); /* Store metadata with destroyed key. 
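The two checks above encode one rule: md_keys is a bitmask of populated master-key slots, and the PKCS#5v2 iteration count may only change while exactly one slot is populated and it is the slot being rewritten, because all slots share a single salt and iteration count. Restated as a standalone predicate (the kernel uses bitcount32(); the GCC/Clang builtin here is just the userland equivalent):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * md_keys: bitmask of populated master-key slots.
     * nkey:    the slot the caller wants to rewrite with new iterations.
     */
    bool
    can_change_iterations(uint32_t md_keys, unsigned int nkey)
    {
        return (__builtin_popcount(md_keys) == 1 &&
            md_keys == (1u << nkey));
    }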
*/ eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot store metadata on %s " "(error=%d).", pp->name, error); } /* * Flush write cache so we don't overwrite data N times in cache * and only once on disk. */ (void)g_io_flush(cp); } explicit_bzero(&md, sizeof(md)); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); if (*all) G_ELI_DEBUG(1, "All keys removed from %s.", pp->name); else G_ELI_DEBUG(1, "Key %d removed from %s.", nkey, pp->name); } static void g_eli_suspend_one(struct g_eli_softc *sc, struct gctl_req *req) { struct g_eli_worker *wr; g_topology_assert(); KASSERT(sc != NULL, ("NULL sc")); if (sc->sc_flags & G_ELI_FLAG_ONETIME) { gctl_error(req, "Device %s is using one-time key, suspend not supported.", sc->sc_name); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { mtx_unlock(&sc->sc_queue_mtx); gctl_error(req, "Device %s already suspended.", sc->sc_name); return; } sc->sc_flags |= G_ELI_FLAG_SUSPEND; wakeup(sc); for (;;) { LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_active) break; } if (wr == NULL) break; /* Not all threads suspended. */ msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); } /* * Clear sensitive data on suspend, they will be recovered on resume. */ explicit_bzero(sc->sc_mkey, sizeof(sc->sc_mkey)); g_eli_key_destroy(sc); explicit_bzero(sc->sc_akey, sizeof(sc->sc_akey)); explicit_bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx)); explicit_bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey)); explicit_bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx)); mtx_unlock(&sc->sc_queue_mtx); G_ELI_DEBUG(0, "Device %s has been suspended.", sc->sc_name); } static void g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *all, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc->sc_flags & G_ELI_FLAG_ONETIME) { G_ELI_DEBUG(0, "Device %s is using one-time key, suspend not supported, skipping.", sc->sc_name); continue; } g_eli_suspend_one(sc, req); } } else { const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } g_eli_suspend_one(sc, req); } } } static void g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_eli_softc *sc; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, keysize, error; u_int nkey; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL 
|| keysize != G_ELI_USERKEYLEN) { gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); mtx_lock(&sc->sc_queue_mtx); if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) gctl_error(req, "Device %s is not suspended.", name); else { /* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */ g_eli_mkey_propagate(sc, mkey); sc->sc_flags &= ~G_ELI_FLAG_SUSPEND; G_ELI_DEBUG(1, "Resumed %s.", pp->name); wakeup(sc); } mtx_unlock(&sc->sc_queue_mtx); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static int g_eli_kill_one(struct g_eli_softc *sc) { struct g_provider *pp; struct g_consumer *cp; int error = 0; g_topology_assert(); if (sc == NULL) return (ENOENT); pp = LIST_FIRST(&sc->sc_geom->provider); g_error_provider(pp, ENXIO); cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; if (sc->sc_flags & G_ELI_FLAG_RO) { G_ELI_DEBUG(0, "WARNING: Metadata won't be erased on read-only " "provider: %s.", pp->name); } else { u_char *sector; u_int i; int err; sector = malloc(pp->sectorsize, M_ELI, M_WAITOK); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) bzero(sector, pp->sectorsize); else arc4rand(sector, pp->sectorsize, 0); err = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (err != 0) { G_ELI_DEBUG(0, "Cannot erase metadata on %s " "(error=%d).", pp->name, err); if (error == 0) error = err; } /* * Flush write cache so we don't overwrite data N times * in cache and only once on disk. 
*/ (void)g_io_flush(cp); } free(sector, M_ELI); } if (error == 0) G_ELI_DEBUG(0, "%s has been killed.", pp->name); g_eli_destroy(sc, TRUE); return (error); } static void g_eli_ctl_kill(struct gctl_req *req, struct g_class *mp) { int *all, *nargs; int error; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { error = g_eli_kill_one(gp->softc); if (error != 0) gctl_error(req, "Not fully done."); } } else { struct g_eli_softc *sc; const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } error = g_eli_kill_one(sc); if (error != 0) gctl_error(req, "Not fully done."); } } } void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } while (*version != G_ELI_VERSION) { if (G_ELI_VERSION == G_ELI_VERSION_06 && *version == G_ELI_VERSION_05) { /* Compatible. */ break; } if (G_ELI_VERSION == G_ELI_VERSION_07 && (*version == G_ELI_VERSION_05 || *version == G_ELI_VERSION_06)) { /* Compatible. */ break; } gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "attach") == 0) g_eli_ctl_attach(req, mp); else if (strcmp(verb, "detach") == 0 || strcmp(verb, "stop") == 0) g_eli_ctl_detach(req, mp); else if (strcmp(verb, "onetime") == 0) g_eli_ctl_onetime(req, mp); else if (strcmp(verb, "configure") == 0) g_eli_ctl_configure(req, mp); else if (strcmp(verb, "setkey") == 0) g_eli_ctl_setkey(req, mp); else if (strcmp(verb, "delkey") == 0) g_eli_ctl_delkey(req, mp); else if (strcmp(verb, "suspend") == 0) g_eli_ctl_suspend(req, mp); else if (strcmp(verb, "resume") == 0) g_eli_ctl_resume(req, mp); else if (strcmp(verb, "kill") == 0) g_eli_ctl_kill(req, mp); else gctl_error(req, "Unknown verb."); } Index: head/sys/geom/eli/g_eli_integrity.c =================================================================== --- head/sys/geom/eli/g_eli_integrity.c (revision 350693) +++ head/sys/geom/eli/g_eli_integrity.c (revision 350694) @@ -1,540 +1,541 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include /* * The data layout description when integrity verification is configured. * * One of the most important assumption here is that authenticated data and its * HMAC has to be stored in the same place (namely in the same sector) to make * it work reliable. * The problem is that file systems work only with sectors that are multiple of * 512 bytes and a power of two number. * My idea to implement it is as follows. * Let's store HMAC in sector. This is a must. This leaves us 480 bytes for * data. We can't use that directly (ie. we can't create provider with 480 bytes * sector size). We need another sector from where we take only 32 bytes of data * and we store HMAC of this data as well. This takes two sectors from the * original provider at the input and leaves us one sector of authenticated data * at the output. Not very efficient, but you got the idea. * Now, let's assume, we want to create provider with 4096 bytes sector. * To output 4096 bytes of authenticated data we need 8x480 plus 1x256, so we * need nine 512-bytes sectors at the input to get one 4096-bytes sector at the * output. That's better. With 4096 bytes sector we can use 89% of size of the * original provider. I find it as an acceptable cost. * The reliability comes from the fact, that every HMAC stored inside the sector * is calculated only for the data in the same sector, so its impossible to * write new data and leave old HMAC or vice versa. * * And here is the picture: * * da0: +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+ * |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |256b | * |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data | * +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+ * |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |288 bytes | * +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ |224 unused| * +----------+ * da0.eli: +----+----+----+----+----+----+----+----+----+ * |480b|480b|480b|480b|480b|480b|480b|480b|256b| * +----+----+----+----+----+----+----+----+----+ * | 4096 bytes | * +--------------------------------------------+ * * PS. You can use any sector size with geli(8). My example is using 4kB, * because it's most efficient. For 8kB sectors you need 2 extra sectors, * so the cost is the same as for 4kB sectors. 
*/ /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> g_eli_auth_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_auth_run -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ MALLOC_DECLARE(M_ELI); /* * Here we generate key for HMAC. Every sector has its own HMAC key, so it is * not possible to copy sectors. * We cannot depend on fact, that every sector has its own IV, because different * IV doesn't change HMAC, when we use encrypt-then-authenticate method. */ static void g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key) { SHA256_CTX ctx; /* Copy precalculated SHA256 context. */ bcopy(&sc->sc_akeyctx, &ctx, sizeof(ctx)); SHA256_Update(&ctx, (uint8_t *)&offset, sizeof(offset)); SHA256_Final(key, &ctx); } /* * The function is called after we read and decrypt data. * * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> G_ELI_AUTH_READ_DONE -> g_io_deliver */ static int g_eli_auth_read_done(struct cryptop *crp) { struct g_eli_softc *sc; struct bio *bp; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { bp->bio_completed += crp->crp_olen; G_ELI_DEBUG(3, "Crypto READ request done (%d/%d) (add=%jd completed=%jd).", bp->bio_inbed, bp->bio_children, (intmax_t)crp->crp_olen, (intmax_t)bp->bio_completed); } else { G_ELI_DEBUG(1, "Crypto READ request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } sc = bp->bio_to->geom->softc; g_eli_key_drop(sc, crp->crp_desc->crd_next->crd_key); /* * Do we have all sectors already? */ if (bp->bio_inbed < bp->bio_children) return (0); if (bp->bio_error == 0) { u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize; u_char *srcdata, *dstdata, *auth; off_t coroff, corsize; /* * Verify data integrity based on calculated and read HMACs. */ /* Sectorsize of decrypted provider eg. 4096. */ decr_secsize = bp->bio_to->sectorsize; /* The real sectorsize of encrypted provider, eg. 512. */ encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize; /* Number of data bytes in one encrypted sector, eg. 480. */ data_secsize = sc->sc_data_per_sector; /* Number of sectors from decrypted provider, eg. 2. */ nsec = bp->bio_length / decr_secsize; /* Number of sectors from encrypted provider, eg. 18. */ nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize; /* Last sector number in every big sector, eg. 9. */ lsec = sc->sc_bytes_per_sector / encr_secsize; srcdata = bp->bio_driver2; dstdata = bp->bio_data; auth = srcdata + encr_secsize * nsec; coroff = -1; corsize = 0; for (i = 1; i <= nsec; i++) { data_secsize = sc->sc_data_per_sector; if ((i % lsec) == 0) data_secsize = decr_secsize % data_secsize; if (bcmp(srcdata, auth, sc->sc_alen) != 0) { /* * Curruption detected, remember the offset if * this is the first corrupted sector and * increase size. */ if (bp->bio_error == 0) bp->bio_error = -1; if (coroff == -1) { coroff = bp->bio_offset + (dstdata - (u_char *)bp->bio_data); } corsize += data_secsize; } else { /* * No curruption, good. * Report previous corruption if there was one. 
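The layout comment above works this arithmetic out for the 4 kB example; the same accounting as a standalone program, with the per-sector HMAC length taken as 32 bytes (HMAC/SHA256) and the rounding of the data area to a 16-byte multiple assumed to match the cipher block size:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned encr_secsize = 512;	/* raw provider sector */
        unsigned decr_secsize = 4096;	/* .eli provider sector */
        unsigned alen = 32;		/* per-sector HMAC length */
        unsigned data_per_sector, nsec, bytes_per_sector, tail, unused;

        data_per_sector = encr_secsize - alen;			/* 480 */
        data_per_sector -= data_per_sector % 16;		/* stay block-aligned */
        nsec = (decr_secsize + data_per_sector - 1) / data_per_sector; /* 9 */
        bytes_per_sector = nsec * encr_secsize;			/* 4608 */
        tail = decr_secsize % data_per_sector;			/* 256 */
        unused = encr_secsize - alen - tail;			/* 224 */

        printf("%u raw sectors per %u-byte sector, %u data bytes in the last "
            "one (%u unused), %.1f%% usable\n", nsec, decr_secsize, tail,
            unused, 100.0 * decr_secsize / bytes_per_sector);
        return (0);
    }

It prints "9 raw sectors per 4096-byte sector, 256 data bytes in the last one (224 unused), 88.9% usable", matching the picture and the sc_data_per_sector/sc_bytes_per_sector values the read and write paths below rely on.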
*/ if (coroff != -1) { G_ELI_DEBUG(0, "%s: Failed to authenticate %jd " "bytes of data at offset %jd.", sc->sc_name, (intmax_t)corsize, (intmax_t)coroff); coroff = -1; corsize = 0; } bcopy(srcdata + sc->sc_alen, dstdata, data_secsize); } srcdata += encr_secsize; dstdata += data_secsize; auth += sc->sc_alen; } /* Report previous corruption if there was one. */ if (coroff != -1) { G_ELI_DEBUG(0, "%s: Failed to authenticate %jd " "bytes of data at offset %jd.", sc->sc_name, (intmax_t)corsize, (intmax_t)coroff); } } free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; if (bp->bio_error != 0) { if (bp->bio_error == -1) bp->bio_error = EINVAL; else { G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).", bp->bio_error); } bp->bio_completed = 0; } /* * Read is finished, send it up. */ g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } /* * The function is called after data encryption. * * g_eli_start -> g_eli_auth_run -> G_ELI_AUTH_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver */ static int g_eli_auth_write_done(struct cryptop *crp) { struct g_eli_softc *sc; struct g_consumer *cp; struct bio *bp, *cbp, *cbp2; u_int nsec; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).", bp->bio_inbed, bp->bio_children); } else { G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } sc = bp->bio_to->geom->softc; g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * All sectors are already encrypted? */ if (bp->bio_inbed < bp->bio_children) return (0); if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; cbp = bp->bio_driver1; bp->bio_driver1 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } cp = LIST_FIRST(&sc->sc_geom->consumer); cbp = bp->bio_driver1; bp->bio_driver1 = NULL; cbp->bio_to = cp->provider; cbp->bio_done = g_eli_write_done; /* Number of sectors from decrypted provider, eg. 1. */ nsec = bp->bio_length / bp->bio_to->sectorsize; /* Number of sectors from encrypted provider, eg. 9. */ nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize; cbp->bio_length = cp->provider->sectorsize * nsec; cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; cbp->bio_data = bp->bio_driver2; /* * We write more than what is requested, so we have to be ready to write * more than MAXPHYS. */ cbp2 = NULL; if (cbp->bio_length > MAXPHYS) { cbp2 = g_duplicate_bio(bp); cbp2->bio_length = cbp->bio_length - MAXPHYS; cbp2->bio_data = cbp->bio_data + MAXPHYS; cbp2->bio_offset = cbp->bio_offset + MAXPHYS; cbp2->bio_to = cp->provider; cbp2->bio_done = g_eli_write_done; cbp->bio_length = MAXPHYS; } /* * Send encrypted data to the provider. */ G_ELI_LOGREQ(2, cbp, "Sending request."); bp->bio_inbed = 0; bp->bio_children = (cbp2 != NULL ? 
2 : 1); g_io_request(cbp, cp); if (cbp2 != NULL) { G_ELI_LOGREQ(2, cbp2, "Sending request."); g_io_request(cbp2, cp); } return (0); } void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp) { struct g_consumer *cp; struct bio *cbp, *cbp2; size_t size; off_t nsec; bp->bio_pflags = 0; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp = bp->bio_driver1; bp->bio_driver1 = NULL; cbp->bio_to = cp->provider; cbp->bio_done = g_eli_read_done; /* Number of sectors from decrypted provider, eg. 1. */ nsec = bp->bio_length / bp->bio_to->sectorsize; /* Number of sectors from encrypted provider, eg. 9. */ nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize; cbp->bio_length = cp->provider->sectorsize * nsec; size = cbp->bio_length; size += sc->sc_alen * nsec; size += sizeof(struct cryptop) * nsec; size += sizeof(struct cryptodesc) * nsec * 2; size += G_ELI_AUTH_SECKEYLEN * nsec; cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; bp->bio_driver2 = malloc(size, M_ELI, M_WAITOK); cbp->bio_data = bp->bio_driver2; /* * We read more than what is requested, so we have to be ready to read * more than MAXPHYS. */ cbp2 = NULL; if (cbp->bio_length > MAXPHYS) { cbp2 = g_duplicate_bio(bp); cbp2->bio_length = cbp->bio_length - MAXPHYS; cbp2->bio_data = cbp->bio_data + MAXPHYS; cbp2->bio_offset = cbp->bio_offset + MAXPHYS; cbp2->bio_to = cp->provider; cbp2->bio_done = g_eli_read_done; cbp->bio_length = MAXPHYS; } /* * Read encrypted data from provider. */ G_ELI_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); if (cbp2 != NULL) { G_ELI_LOGREQ(2, cbp2, "Sending request."); g_io_request(cbp2, cp); } } /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). * * BIO_READ: * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp) { struct g_eli_softc *sc; struct cryptop *crp; struct cryptodesc *crde, *crda; u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize; off_t dstoff; u_char *p, *data, *auth, *authkey, *plaindata; int error; G_ELI_LOGREQ(3, bp, "%s", __func__); bp->bio_pflags = wr->w_number; sc = wr->w_softc; /* Sectorsize of decrypted provider eg. 4096. */ decr_secsize = bp->bio_to->sectorsize; /* The real sectorsize of encrypted provider, eg. 512. */ encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize; /* Number of data bytes in one encrypted sector, eg. 480. */ data_secsize = sc->sc_data_per_sector; /* Number of sectors from decrypted provider, eg. 2. */ nsec = bp->bio_length / decr_secsize; /* Number of sectors from encrypted provider, eg. 18. */ nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize; /* Last sector number in every big sector, eg. 9. */ lsec = sc->sc_bytes_per_sector / encr_secsize; /* Destination offset, used for IV generation. */ dstoff = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; auth = NULL; /* Silence compiler warning. 
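Because each 4096-byte sector of authenticated data occupies nine 512-byte sectors (4608 bytes) on the raw provider, both the write path and the read path above rescale the child bio's offset and length, and a request that fits in MAXPHYS on the .eli device can exceed it underneath, which is what the cbp/cbp2 split handles. A small worked example of that scaling (the MAXPHYS value is an assumption about a typical kernel configuration):

    #include <stdio.h>

    #define ASSUMED_MAXPHYS	(128ULL * 1024)	/* assumption: typical MAXPHYS */

    int
    main(void)
    {
        /* A 128 kB read at offset 64 kB on a 4096-byte .eli provider. */
        unsigned long long bio_offset = 65536, bio_length = 131072;
        unsigned long long decr_secsize = 4096, encr_secsize = 512;
        unsigned long long bytes_per_sector = 4608;	/* 9 x 512, see above */
        unsigned long long nsec, clen, coff;

        nsec = bio_length / decr_secsize;		/* 32 */
        nsec = nsec * bytes_per_sector / encr_secsize;	/* 288 */
        clen = nsec * encr_secsize;			/* 147456 */
        coff = bio_offset / decr_secsize * bytes_per_sector; /* 73728 */

        printf("raw I/O: %llu bytes at offset %llu\n", clen, coff);
        if (clen > ASSUMED_MAXPHYS) {
            printf("split: %llu at %llu, then %llu at %llu\n",
                ASSUMED_MAXPHYS, coff, clen - ASSUMED_MAXPHYS,
                coff + ASSUMED_MAXPHYS);
        }
        return (0);
    }

The 128 kB request becomes 147456 bytes of raw I/O at offset 73728, split into 131072 + 16384 bytes.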
*/ plaindata = bp->bio_data; if (bp->bio_cmd == BIO_READ) { data = bp->bio_driver2; auth = data + encr_secsize * nsec; p = auth + sc->sc_alen * nsec; } else { size_t size; size = encr_secsize * nsec; size += sizeof(*crp) * nsec; size += sizeof(*crde) * nsec; size += sizeof(*crda) * nsec; size += G_ELI_AUTH_SECKEYLEN * nsec; size += sizeof(uintptr_t); /* Space for alignment. */ data = malloc(size, M_ELI, M_WAITOK); bp->bio_driver2 = data; p = data + encr_secsize * nsec; } bp->bio_inbed = 0; bp->bio_children = nsec; #if defined(__mips_n64) || defined(__mips_o64) p = (char *)roundup((uintptr_t)p, sizeof(uintptr_t)); #endif for (i = 1; i <= nsec; i++, dstoff += encr_secsize) { crp = (struct cryptop *)p; p += sizeof(*crp); crde = (struct cryptodesc *)p; p += sizeof(*crde); crda = (struct cryptodesc *)p; p += sizeof(*crda); authkey = (u_char *)p; p += G_ELI_AUTH_SECKEYLEN; data_secsize = sc->sc_data_per_sector; if ((i % lsec) == 0) { data_secsize = decr_secsize % data_secsize; /* * Last encrypted sector of each decrypted sector is * only partially filled. */ if (bp->bio_cmd == BIO_WRITE) memset(data + sc->sc_alen + data_secsize, 0, encr_secsize - sc->sc_alen - data_secsize); } if (bp->bio_cmd == BIO_READ) { /* Remember read HMAC. */ bcopy(data, auth, sc->sc_alen); auth += sc->sc_alen; /* TODO: bzero(9) can be commented out later. */ bzero(data, sc->sc_alen); } else { bcopy(plaindata, data + sc->sc_alen, data_secsize); plaindata += data_secsize; } crp->crp_session = wr->w_sid; crp->crp_ilen = sc->sc_alen + data_secsize; crp->crp_olen = data_secsize; crp->crp_opaque = (void *)bp; crp->crp_buf = (void *)data; data += encr_secsize; crp->crp_flags = CRYPTO_F_CBIFSYNC; if (g_eli_batch) crp->crp_flags |= CRYPTO_F_BATCH; if (bp->bio_cmd == BIO_WRITE) { crp->crp_callback = g_eli_auth_write_done; crp->crp_desc = crde; crde->crd_next = crda; crda->crd_next = NULL; } else { crp->crp_callback = g_eli_auth_read_done; crp->crp_desc = crda; crda->crd_next = crde; crde->crd_next = NULL; } crde->crd_skip = sc->sc_alen; crde->crd_len = data_secsize; crde->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) == 0) crde->crd_flags |= CRD_F_KEY_EXPLICIT; if (bp->bio_cmd == BIO_WRITE) crde->crd_flags |= CRD_F_ENCRYPT; crde->crd_alg = sc->sc_ealgo; crde->crd_key = g_eli_key_hold(sc, dstoff, encr_secsize); crde->crd_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crde->crd_klen <<= 1; g_eli_crypto_ivgen(sc, dstoff, crde->crd_iv, sizeof(crde->crd_iv)); crda->crd_skip = sc->sc_alen; crda->crd_len = data_secsize; crda->crd_inject = 0; crda->crd_flags = CRD_F_KEY_EXPLICIT; crda->crd_alg = sc->sc_aalgo; g_eli_auth_keygen(sc, dstoff, authkey); crda->crd_key = authkey; crda->crd_klen = G_ELI_AUTH_SECKEYLEN * 8; crp->crp_etype = 0; error = crypto_dispatch(crp); KASSERT(error == 0, ("crypto_dispatch() failed (error=%d)", error)); } } Index: head/sys/geom/eli/g_eli_privacy.c =================================================================== --- head/sys/geom/eli/g_eli_privacy.c (revision 350693) +++ head/sys/geom/eli/g_eli_privacy.c (revision 350694) @@ -1,318 +1,319 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ MALLOC_DECLARE(M_ELI); /* * The function is called after we read and decrypt data. * * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver */ static int g_eli_crypto_read_done(struct cryptop *crp) { struct g_eli_softc *sc; struct bio *bp; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto READ request done (%d/%d).", bp->bio_inbed, bp->bio_children); bp->bio_completed += crp->crp_olen; } else { G_ELI_DEBUG(1, "Crypto READ request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } sc = bp->bio_to->geom->softc; if (sc != NULL) g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * Do we have all sectors already? */ if (bp->bio_inbed < bp->bio_children) return (0); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).", bp->bio_error); bp->bio_completed = 0; } /* * Read is finished, send it up. */ g_io_deliver(bp, bp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); return (0); } /* * The function is called after data encryption. 
* * g_eli_start -> g_eli_crypto_run -> G_ELI_CRYPTO_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver */ static int g_eli_crypto_write_done(struct cryptop *crp) { struct g_eli_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *bp, *cbp; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).", bp->bio_inbed, bp->bio_children); } else { G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } gp = bp->bio_to->geom; sc = gp->softc; g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * All sectors are already encrypted? */ if (bp->bio_inbed < bp->bio_children) return (0); bp->bio_inbed = 0; bp->bio_children = 1; cbp = bp->bio_driver1; bp->bio_driver1 = NULL; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } cbp->bio_data = bp->bio_driver2; cbp->bio_done = g_eli_write_done; cp = LIST_FIRST(&gp->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); /* * Send encrypted data to the provider. */ g_io_request(cbp, cp); return (0); } /* * The function is called to read encrypted data. * * g_eli_start -> G_ELI_CRYPTO_READ -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker) { struct g_consumer *cp; struct bio *cbp; if (!fromworker) { /* * We are not called from the worker thread, so check if * device is suspended. */ mtx_lock(&sc->sc_queue_mtx); if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { /* * If device is suspended, we place the request onto * the queue, so it can be handled after resume. */ G_ELI_DEBUG(0, "device suspended, move onto queue"); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); return; } atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); } bp->bio_pflags = 0; bp->bio_driver2 = NULL; cbp = bp->bio_driver1; cbp->bio_done = g_eli_read_done; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); /* * Read encrypted data from provider. */ g_io_request(cbp, cp); } /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). * * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) { struct g_eli_softc *sc; struct cryptop *crp; struct cryptodesc *crd; u_int i, nsec, secsize; off_t dstoff; size_t size; u_char *p, *data; int error; G_ELI_LOGREQ(3, bp, "%s", __func__); bp->bio_pflags = wr->w_number; sc = wr->w_softc; secsize = LIST_FIRST(&sc->sc_geom->provider)->sectorsize; nsec = bp->bio_length / secsize; /* * Calculate how much memory do we need. * We need separate crypto operation for every single sector. 
* It is much faster to calculate total amount of needed memory here and * do the allocation once instead of allocating memory in pieces (many, * many pieces). */ size = sizeof(*crp) * nsec; size += sizeof(*crd) * nsec; /* * If we write the data we cannot destroy current bio_data content, * so we need to allocate more memory for encrypted data. */ if (bp->bio_cmd == BIO_WRITE) size += bp->bio_length; p = malloc(size, M_ELI, M_WAITOK); bp->bio_inbed = 0; bp->bio_children = nsec; bp->bio_driver2 = p; if (bp->bio_cmd == BIO_READ) data = bp->bio_data; else { data = p; p += bp->bio_length; bcopy(bp->bio_data, data, bp->bio_length); } for (i = 0, dstoff = bp->bio_offset; i < nsec; i++, dstoff += secsize) { crp = (struct cryptop *)p; p += sizeof(*crp); crd = (struct cryptodesc *)p; p += sizeof(*crd); crp->crp_session = wr->w_sid; crp->crp_ilen = secsize; crp->crp_olen = secsize; crp->crp_opaque = (void *)bp; crp->crp_buf = (void *)data; data += secsize; if (bp->bio_cmd == BIO_WRITE) crp->crp_callback = g_eli_crypto_write_done; else /* if (bp->bio_cmd == BIO_READ) */ crp->crp_callback = g_eli_crypto_read_done; crp->crp_flags = CRYPTO_F_CBIFSYNC; if (g_eli_batch) crp->crp_flags |= CRYPTO_F_BATCH; crp->crp_desc = crd; crd->crd_skip = 0; crd->crd_len = secsize; crd->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) == 0) crd->crd_flags |= CRD_F_KEY_EXPLICIT; if (bp->bio_cmd == BIO_WRITE) crd->crd_flags |= CRD_F_ENCRYPT; crd->crd_alg = sc->sc_ealgo; crd->crd_key = g_eli_key_hold(sc, dstoff, secsize); crd->crd_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crd->crd_klen <<= 1; g_eli_crypto_ivgen(sc, dstoff, crd->crd_iv, sizeof(crd->crd_iv)); crd->crd_next = NULL; crp->crp_etype = 0; error = crypto_dispatch(crp); KASSERT(error == 0, ("crypto_dispatch() failed (error=%d)", error)); } } Index: head/sys/geom/gate/g_gate.c =================================================================== --- head/sys/geom/gate/g_gate.c (revision 350693) +++ head/sys/geom/gate/g_gate.c (revision 350694) @@ -1,967 +1,968 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Pawel Jakub Dawidek * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_gate, "GEOM Gate module"); static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0, "GEOM_GATE configuration"); static int g_gate_debug = 0; SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RWTUN, &g_gate_debug, 0, "Debug level"); static u_int g_gate_maxunits = 256; SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN, &g_gate_maxunits, 0, "Maximum number of ggate devices"); struct g_class g_gate_class = { .name = G_GATE_CLASS_NAME, .version = G_VERSION, }; static struct cdev *status_dev; static d_ioctl_t g_gate_ioctl; static struct cdevsw g_gate_cdevsw = { .d_version = D_VERSION, .d_ioctl = g_gate_ioctl, .d_name = G_GATE_CTL_NAME }; static struct g_gate_softc **g_gate_units; static u_int g_gate_nunits; static struct mtx g_gate_units_lock; static int g_gate_destroy(struct g_gate_softc *sc, boolean_t force) { struct bio_queue_head queue; struct g_provider *pp; struct g_consumer *cp; struct g_geom *gp; struct bio *bp; g_topology_assert(); mtx_assert(&g_gate_units_lock, MA_OWNED); pp = sc->sc_provider; if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { mtx_unlock(&g_gate_units_lock); return (EBUSY); } mtx_unlock(&g_gate_units_lock); mtx_lock(&sc->sc_queue_mtx); if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) sc->sc_flags |= G_GATE_FLAG_DESTROY; wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); gp = pp->geom; g_wither_provider(pp, ENXIO); callout_drain(&sc->sc_callout); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_inqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } while ((bp = bioq_takefirst(&sc->sc_outqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); g_topology_unlock(); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request canceled."); g_io_deliver(bp, ENXIO); } mtx_lock(&g_gate_units_lock); /* One reference is ours. 
*/ sc->sc_ref--; while (sc->sc_ref > 0) msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); g_topology_lock(); if ((cp = sc->sc_readcons) != NULL) { sc->sc_readcons = NULL; (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } G_GATE_DEBUG(1, "Device %s destroyed.", gp->name); gp->softc = NULL; g_wither_geom(gp, ENXIO); sc->sc_provider = NULL; free(sc, M_GATE); return (0); } static int g_gate_access(struct g_provider *pp, int dr, int dw, int de) { struct g_gate_softc *sc; if (dr <= 0 && dw <= 0 && de <= 0) return (0); sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) return (ENXIO); /* XXX: Hack to allow read-only mounts. */ #if 0 if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0 && dw > 0) return (EPERM); #endif if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0 && dr > 0) return (EPERM); return (0); } static void g_gate_queue_io(struct bio *bp) { struct g_gate_softc *sc; sc = bp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(bp, ENXIO); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_queue_size > 0 && sc->sc_queue_count > sc->sc_queue_size) { mtx_unlock(&sc->sc_queue_mtx); G_GATE_LOGREQ(1, bp, "Queue full, request canceled."); g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = (void *)sc->sc_seq; sc->sc_seq++; sc->sc_queue_count++; bioq_insert_tail(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } static void g_gate_done(struct bio *cbp) { struct bio *pbp; pbp = cbp->bio_parent; if (cbp->bio_error == 0) { pbp->bio_completed = cbp->bio_completed; g_destroy_bio(cbp); pbp->bio_inbed++; g_io_deliver(pbp, 0); } else { /* If direct read failed, pass it through userland daemon. */ g_destroy_bio(cbp); pbp->bio_children--; g_gate_queue_io(pbp); } } static void g_gate_start(struct bio *pbp) { struct g_gate_softc *sc; sc = pbp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(pbp, ENXIO); return; } G_GATE_LOGREQ(2, pbp, "Request received."); switch (pbp->bio_cmd) { case BIO_READ: if (sc->sc_readcons != NULL) { struct bio *cbp; cbp = g_clone_bio(pbp); if (cbp == NULL) { g_io_deliver(pbp, ENOMEM); return; } cbp->bio_done = g_gate_done; cbp->bio_offset = pbp->bio_offset + sc->sc_readoffset; cbp->bio_to = sc->sc_readcons->provider; g_io_request(cbp, sc->sc_readcons); return; } break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: /* XXX: Hack to allow read-only mounts. 
*/ if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { g_io_deliver(pbp, EPERM); return; } break; case BIO_GETATTR: default: G_GATE_LOGREQ(2, pbp, "Ignoring request."); g_io_deliver(pbp, EOPNOTSUPP); return; } g_gate_queue_io(pbp); } static struct g_gate_softc * g_gate_hold(int unit, const char *name) { struct g_gate_softc *sc = NULL; mtx_lock(&g_gate_units_lock); if (unit >= 0 && unit < g_gate_maxunits) sc = g_gate_units[unit]; else if (unit == G_GATE_NAME_GIVEN) { KASSERT(name != NULL, ("name is NULL")); for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_provider->name) != 0) { continue; } sc = g_gate_units[unit]; break; } } if (sc != NULL) sc->sc_ref++; mtx_unlock(&g_gate_units_lock); return (sc); } static void g_gate_release(struct g_gate_softc *sc) { g_topology_assert_not(); mtx_lock(&g_gate_units_lock); sc->sc_ref--; KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name)); if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) wakeup(&sc->sc_ref); mtx_unlock(&g_gate_units_lock); } static int g_gate_getunit(int unit, int *errorp) { mtx_assert(&g_gate_units_lock, MA_OWNED); if (unit >= 0) { if (unit >= g_gate_maxunits) *errorp = EINVAL; else if (g_gate_units[unit] == NULL) return (unit); else *errorp = EEXIST; } else { for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) return (unit); } *errorp = ENFILE; } return (-1); } static void g_gate_guard(void *arg) { struct bio_queue_head queue; struct g_gate_softc *sc; struct bintime curtime; struct bio *bp, *bp2; sc = arg; binuptime(&curtime); g_gate_hold(sc->sc_unit, NULL); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_inqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request timeout."); g_io_deliver(bp, EIO); } if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } g_gate_release(sc); } static void g_gate_orphan(struct g_consumer *cp) { struct g_gate_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; KASSERT(cp == sc->sc_readcons, ("cp=%p sc_readcons=%p", cp, sc->sc_readcons)); sc->sc_readcons = NULL; G_GATE_DEBUG(1, "Destroying read consumer on provider %s orphan.", cp->provider->name); (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } static void g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_gate_softc *sc; sc = gp->softc; if (sc == NULL || pp != NULL || cp != NULL) return; sc = g_gate_hold(sc->sc_unit, NULL); if (sc == NULL) return; if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "read-only"); } else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "write-only"); } else { sbuf_printf(sb, "%s%s\n", indent, "read-write"); } if (sc->sc_readcons != NULL) { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_readoffset); sbuf_printf(sb, "%s%s\n", indent, 
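[Editorial note] g_gate_hold() and g_gate_release() above implement the reference count that keeps a unit alive while it is in use: hold bumps sc_ref under g_gate_units_lock (returning NULL if the unit does not exist), and release drops it and wakes a destroy that is sleeping until the count reaches zero. A minimal sketch of the pattern the ioctl handlers follow; the function name is hypothetical and both helpers are static to this file:

static int
example_with_unit_held(int unit)
{
	struct g_gate_softc *sc;

	sc = g_gate_hold(unit, NULL);	/* take a reference, or fail */
	if (sc == NULL)
		return (ENXIO);
	/* ... use sc; g_gate_destroy() waits until the reference is dropped ... */
	g_gate_release(sc);		/* drop it, possibly waking a pending destroy */
	return (0);
}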
sc->sc_readcons->provider->name); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_timeout); sbuf_printf(sb, "%s%s\n", indent, sc->sc_info); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_count); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_size); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ref); sbuf_printf(sb, "%s%d\n", indent, sc->sc_unit); g_topology_unlock(); g_gate_release(sc); g_topology_lock(); } static int g_gate_create(struct g_gate_ctl_create *ggio) { struct g_gate_softc *sc; struct g_geom *gp; struct g_provider *pp, *ropp; struct g_consumer *cp; char name[NAME_MAX]; int error = 0, unit; if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if (ggio->gctl_sectorsize <= 0) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if (!powerof2(ggio->gctl_sectorsize)) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if ((ggio->gctl_mediasize % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if ((ggio->gctl_flags & G_GATE_FLAG_READONLY) != 0 && (ggio->gctl_flags & G_GATE_FLAG_WRITEONLY) != 0) { G_GATE_DEBUG(1, "Invalid flags."); return (EINVAL); } if (ggio->gctl_unit != G_GATE_UNIT_AUTO && ggio->gctl_unit != G_GATE_NAME_GIVEN && ggio->gctl_unit < 0) { G_GATE_DEBUG(1, "Invalid unit number."); return (EINVAL); } if (ggio->gctl_unit == G_GATE_NAME_GIVEN && ggio->gctl_name[0] == '\0') { G_GATE_DEBUG(1, "No device name."); return (EINVAL); } sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO); sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS); strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); sc->sc_seq = 1; bioq_init(&sc->sc_inqueue); bioq_init(&sc->sc_outqueue); mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF); sc->sc_queue_count = 0; sc->sc_queue_size = ggio->gctl_maxcount; if (sc->sc_queue_size > G_GATE_MAX_QUEUE_SIZE) sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE; sc->sc_timeout = ggio->gctl_timeout; callout_init(&sc->sc_callout, 1); mtx_lock(&g_gate_units_lock); sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error); if (sc->sc_unit < 0) goto fail1; if (ggio->gctl_unit == G_GATE_NAME_GIVEN) snprintf(name, sizeof(name), "%s", ggio->gctl_name); else { snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME, sc->sc_unit); } /* Check for name collision. 
*/ for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_name) != 0) continue; error = EEXIST; goto fail1; } sc->sc_name = name; g_gate_units[sc->sc_unit] = sc; g_gate_nunits++; mtx_unlock(&g_gate_units_lock); g_topology_lock(); if (ggio->gctl_readprov[0] == '\0') { ropp = NULL; } else { ropp = g_provider_by_name(ggio->gctl_readprov); if (ropp == NULL) { G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); error = EINVAL; goto fail2; } if ((ggio->gctl_readoffset % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); error = EINVAL; goto fail2; } if (ggio->gctl_mediasize + ggio->gctl_readoffset > ropp->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); error = EINVAL; goto fail2; } } gp = g_new_geomf(&g_gate_class, "%s", name); gp->start = g_gate_start; gp->access = g_gate_access; gp->orphan = g_gate_orphan; gp->dumpconf = g_gate_dumpconf; gp->softc = sc; if (ropp != NULL) { cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, ropp); if (error != 0) { G_GATE_DEBUG(1, "Unable to attach to %s.", ropp->name); goto fail3; } error = g_access(cp, 1, 0, 0); if (error != 0) { G_GATE_DEBUG(1, "Unable to access %s.", ropp->name); g_detach(cp); goto fail3; } sc->sc_readcons = cp; sc->sc_readoffset = ggio->gctl_readoffset; } ggio->gctl_unit = sc->sc_unit; pp = g_new_providerf(gp, "%s", name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; pp->mediasize = ggio->gctl_mediasize; pp->sectorsize = ggio->gctl_sectorsize; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); mtx_lock(&g_gate_units_lock); sc->sc_name = sc->sc_provider->name; mtx_unlock(&g_gate_units_lock); G_GATE_DEBUG(1, "Device %s created.", gp->name); if (sc->sc_timeout > 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } return (0); fail3: g_destroy_consumer(cp); g_destroy_geom(gp); fail2: g_topology_unlock(); mtx_lock(&g_gate_units_lock); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; fail1: mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); free(sc, M_GATE); return (error); } static int g_gate_modify(struct g_gate_softc *sc, struct g_gate_ctl_modify *ggio) { struct g_provider *pp; struct g_consumer *cp; int error; if ((ggio->gctl_modify & GG_MODIFY_MEDIASIZE) != 0) { if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_mediasize % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } g_resize_provider(pp, ggio->gctl_mediasize); return (0); } if ((ggio->gctl_modify & GG_MODIFY_INFO) != 0) (void)strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); cp = NULL; if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { g_topology_lock(); if (sc->sc_readcons != NULL) { cp = sc->sc_readcons; sc->sc_readcons = NULL; (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } if (ggio->gctl_readprov[0] != '\0') { pp = g_provider_by_name(ggio->gctl_readprov); if (pp == NULL) { g_topology_unlock(); G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); return (EINVAL); } cp = g_new_consumer(sc->sc_provider->geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { G_GATE_DEBUG(1, "Unable to attach to %s.", pp->name); } else { error = g_access(cp, 1, 0, 0); if (error != 0) { 
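[Editorial note] g_gate_create() above is driven from userland through the G_GATE_CMD_CREATE ioctl on the control device (G_GATE_CTL_NAME, i.e. /dev/ggctl). A hypothetical userland sketch using the structures declared in g_gate.h later in this change; the sizes are arbitrary and error handling is abbreviated:

#include <sys/param.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>

#include <geom/gate/g_gate.h>

int
ggate_create_example(void)
{
	struct g_gate_ctl_create ggio;
	int fd;

	memset(&ggio, 0, sizeof(ggio));
	ggio.gctl_version = G_GATE_VERSION;
	ggio.gctl_mediasize = 1024 * 1024 * 1024;	/* must be a multiple of the sector size */
	ggio.gctl_sectorsize = 512;
	ggio.gctl_flags = 0;				/* read-write */
	ggio.gctl_maxcount = 256;			/* clamped to G_GATE_MAX_QUEUE_SIZE */
	ggio.gctl_timeout = 30;
	ggio.gctl_unit = G_GATE_UNIT_AUTO;
	strlcpy(ggio.gctl_info, "example", sizeof(ggio.gctl_info));

	fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
	if (fd == -1)
		return (-1);
	if (ioctl(fd, G_GATE_CMD_CREATE, &ggio) == -1)
		return (-1);
	/* On success the kernel fills in gctl_unit; the provider is ggate<unit>. */
	return (ggio.gctl_unit);
}

A serving daemon would then loop on G_GATE_CMD_START to fetch requests and G_GATE_CMD_DONE to complete them, which is the path handled by g_gate_ioctl() below.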
G_GATE_DEBUG(1, "Unable to access %s.", pp->name); g_detach(cp); } } if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } } } else { cp = sc->sc_readcons; } if ((ggio->gctl_modify & GG_MODIFY_READOFFSET) != 0) { if (cp == NULL) { G_GATE_DEBUG(1, "No read provider."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_readoffset % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); return (EINVAL); } if (pp->mediasize + ggio->gctl_readoffset > cp->provider->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); return (EINVAL); } sc->sc_readoffset = ggio->gctl_readoffset; } if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { sc->sc_readcons = cp; g_topology_unlock(); } return (0); } #define G_GATE_CHECK_VERSION(ggio) do { \ if ((ggio)->gctl_version != G_GATE_VERSION) { \ printf("Version mismatch %d != %d.\n", \ ggio->gctl_version, G_GATE_VERSION); \ return (EINVAL); \ } \ } while (0) static int g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct g_gate_softc *sc; struct bio *bp; int error = 0; G_GATE_DEBUG(4, "ioctl(%s, %lx, %p, %x, %p)", devtoname(dev), cmd, addr, flags, td); switch (cmd) { case G_GATE_CMD_CREATE: { struct g_gate_ctl_create *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); error = g_gate_create(ggio); /* * Reset TDP_GEOM flag. * There are pending events for sure, because we just created * new provider and other classes want to taste it, but we * cannot answer on I/O requests until we're here. */ td->td_pflags &= ~TDP_GEOM; return (error); } case G_GATE_CMD_MODIFY: { struct g_gate_ctl_modify *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = g_gate_modify(sc, ggio); g_gate_release(sc); return (error); } case G_GATE_CMD_DESTROY: { struct g_gate_ctl_destroy *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); g_topology_lock(); mtx_lock(&g_gate_units_lock); error = g_gate_destroy(sc, ggio->gctl_force); g_topology_unlock(); if (error != 0) g_gate_release(sc); return (error); } case G_GATE_CMD_CANCEL: { struct g_gate_ctl_cancel *ggio = (void *)addr; struct bio *tbp, *lbp; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); lbp = NULL; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, tbp) { if (ggio->gctl_seq == 0 || ggio->gctl_seq == (uintptr_t)bp->bio_driver1) { G_GATE_LOGREQ(1, bp, "Request canceled."); bioq_remove(&sc->sc_outqueue, bp); /* * Be sure to put requests back onto incoming * queue in the proper order. */ if (lbp == NULL) bioq_insert_head(&sc->sc_inqueue, bp); else { TAILQ_INSERT_AFTER(&sc->sc_inqueue.queue, lbp, bp, bio_queue); } lbp = bp; /* * If only one request was canceled, leave now. 
*/ if (ggio->gctl_seq != 0) break; } } if (ggio->gctl_unit == G_GATE_NAME_GIVEN) ggio->gctl_unit = sc->sc_unit; mtx_unlock(&sc->sc_queue_mtx); g_gate_release(sc); return (error); } case G_GATE_CMD_START: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = 0; for (;;) { mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_inqueue); if (bp != NULL) break; if ((sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { ggio->gctl_error = ECANCELED; mtx_unlock(&sc->sc_queue_mtx); goto start_end; } if (msleep(sc, &sc->sc_queue_mtx, PPAUSE | PDROP | PCATCH, "ggwait", 0) != 0) { ggio->gctl_error = ECANCELED; goto start_end; } } ggio->gctl_cmd = bp->bio_cmd; if (bp->bio_cmd == BIO_WRITE && bp->bio_length > ggio->gctl_length) { mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_length = bp->bio_length; ggio->gctl_error = ENOMEM; goto start_end; } bioq_remove(&sc->sc_inqueue, bp); bioq_insert_tail(&sc->sc_outqueue, bp); mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_seq = (uintptr_t)bp->bio_driver1; ggio->gctl_offset = bp->bio_offset; ggio->gctl_length = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: case BIO_DELETE: case BIO_FLUSH: break; case BIO_WRITE: error = copyout(bp->bio_data, ggio->gctl_data, bp->bio_length); if (error != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_remove(&sc->sc_outqueue, bp); bioq_insert_head(&sc->sc_inqueue, bp); mtx_unlock(&sc->sc_queue_mtx); goto start_end; } break; } start_end: g_gate_release(sc); return (error); } case G_GATE_CMD_DONE: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENOENT); error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_outqueue.queue, bio_queue) { if (ggio->gctl_seq == (uintptr_t)bp->bio_driver1) break; } if (bp != NULL) { bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; } mtx_unlock(&sc->sc_queue_mtx); if (bp == NULL) { /* * Request was probably canceled. 
*/ goto done_end; } if (ggio->gctl_error == EAGAIN) { bp->bio_error = 0; G_GATE_LOGREQ(1, bp, "Request desisted."); mtx_lock(&sc->sc_queue_mtx); sc->sc_queue_count++; bioq_insert_head(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } else { bp->bio_error = ggio->gctl_error; if (bp->bio_error == 0) { bp->bio_completed = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: error = copyin(ggio->gctl_data, bp->bio_data, bp->bio_length); if (error != 0) bp->bio_error = error; break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: break; } } G_GATE_LOGREQ(2, bp, "Request done."); g_io_deliver(bp, bp->bio_error); } done_end: g_gate_release(sc); return (error); } } return (ENOIOCTL); } static void g_gate_device(void) { status_dev = make_dev(&g_gate_cdevsw, 0x0, UID_ROOT, GID_WHEEL, 0600, G_GATE_CTL_NAME); } static int g_gate_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF); g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]), M_GATE, M_WAITOK | M_ZERO); g_gate_nunits = 0; g_gate_device(); break; case MOD_UNLOAD: mtx_lock(&g_gate_units_lock); if (g_gate_nunits > 0) { mtx_unlock(&g_gate_units_lock); error = EBUSY; break; } mtx_unlock(&g_gate_units_lock); mtx_destroy(&g_gate_units_lock); if (status_dev != NULL) destroy_dev(status_dev); free(g_gate_units, M_GATE); break; default: return (EOPNOTSUPP); break; } return (error); } static moduledata_t g_gate_module = { G_GATE_MOD_NAME, g_gate_modevent, NULL }; DECLARE_MODULE(geom_gate, g_gate_module, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); DECLARE_GEOM_CLASS(g_gate_class, g_gate); MODULE_VERSION(geom_gate, 0); Index: head/sys/geom/gate/g_gate.h =================================================================== --- head/sys/geom/gate/g_gate.h (revision 350693) +++ head/sys/geom/gate/g_gate.h (revision 350694) @@ -1,182 +1,164 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2009 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _G_GATE_H_ #define _G_GATE_H_ #include #include #include #include #include #define G_GATE_CLASS_NAME "GATE" #define G_GATE_PROVIDER_NAME "ggate" #define G_GATE_MOD_NAME "ggate" #define G_GATE_CTL_NAME "ggctl" #define G_GATE_VERSION 3 /* * Maximum number of request that can be stored in * the queue when there are no workers. */ #define G_GATE_MAX_QUEUE_SIZE 4096 #define G_GATE_FLAG_READONLY 0x0001 #define G_GATE_FLAG_WRITEONLY 0x0002 #define G_GATE_FLAG_DESTROY 0x1000 #define G_GATE_USERFLAGS (G_GATE_FLAG_READONLY | G_GATE_FLAG_WRITEONLY) /* * Pick unit number automatically in /dev/ggate. */ #define G_GATE_UNIT_AUTO (-1) /* * Full provider name is given, so don't use ggate. */ #define G_GATE_NAME_GIVEN (-2) #define G_GATE_CMD_CREATE _IOWR('m', 0, struct g_gate_ctl_create) #define G_GATE_CMD_MODIFY _IOWR('m', 1, struct g_gate_ctl_modify) #define G_GATE_CMD_DESTROY _IOWR('m', 2, struct g_gate_ctl_destroy) #define G_GATE_CMD_CANCEL _IOWR('m', 3, struct g_gate_ctl_cancel) #define G_GATE_CMD_START _IOWR('m', 4, struct g_gate_ctl_io) #define G_GATE_CMD_DONE _IOWR('m', 5, struct g_gate_ctl_io) #define G_GATE_INFOSIZE 2048 #ifdef _KERNEL /* * 'P:' means 'Protected by'. */ struct g_gate_softc { char *sc_name; /* P: (read-only) */ int sc_unit; /* P: (read-only) */ int sc_ref; /* P: g_gate_list_mtx */ struct g_provider *sc_provider; /* P: (read-only) */ uint32_t sc_flags; /* P: sc_queue_mtx */ struct bio_queue_head sc_inqueue; /* P: sc_queue_mtx */ struct bio_queue_head sc_outqueue; /* P: sc_queue_mtx */ struct mtx sc_queue_mtx; uint32_t sc_queue_count; /* P: sc_queue_mtx */ uint32_t sc_queue_size; /* P: (read-only) */ u_int sc_timeout; /* P: (read-only) */ struct g_consumer *sc_readcons; /* P: XXX */ off_t sc_readoffset; /* P: XXX */ struct callout sc_callout; /* P: (modified only from callout thread) */ uintptr_t sc_seq; /* P: (modified only from g_down thread) */ LIST_ENTRY(g_gate_softc) sc_next; /* P: g_gate_list_mtx */ char sc_info[G_GATE_INFOSIZE]; /* P: (read-only) */ }; -#define G_GATE_DEBUG(lvl, ...) do { \ - if (g_gate_debug >= (lvl)) { \ - printf("GEOM_GATE"); \ - if (g_gate_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_GATE_LOGREQ(lvl, bp, ...) do { \ - if (g_gate_debug >= (lvl)) { \ - printf("GEOM_GATE"); \ - if (g_gate_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_GATE_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_GATE", g_gate_debug, (lvl), NULL, __VA_ARGS__) +#define G_GATE_LOGREQ(lvl, bp, ...) 
\ + _GEOM_DEBUG("GEOM_GATE", g_gate_debug, (lvl), (bp), __VA_ARGS__) #endif /* !_KERNEL */ struct g_gate_ctl_create { u_int gctl_version; off_t gctl_mediasize; u_int gctl_sectorsize; u_int gctl_flags; u_int gctl_maxcount; u_int gctl_timeout; char gctl_name[NAME_MAX]; char gctl_info[G_GATE_INFOSIZE]; char gctl_readprov[NAME_MAX]; off_t gctl_readoffset; int gctl_unit; /* in/out */ }; #define GG_MODIFY_MEDIASIZE 0x01 #define GG_MODIFY_INFO 0x02 #define GG_MODIFY_READPROV 0x04 #define GG_MODIFY_READOFFSET 0x08 struct g_gate_ctl_modify { u_int gctl_version; int gctl_unit; uint32_t gctl_modify; off_t gctl_mediasize; char gctl_info[G_GATE_INFOSIZE]; char gctl_readprov[NAME_MAX]; off_t gctl_readoffset; }; struct g_gate_ctl_destroy { u_int gctl_version; int gctl_unit; int gctl_force; char gctl_name[NAME_MAX]; }; struct g_gate_ctl_cancel { u_int gctl_version; int gctl_unit; uintptr_t gctl_seq; char gctl_name[NAME_MAX]; }; struct g_gate_ctl_io { u_int gctl_version; int gctl_unit; uintptr_t gctl_seq; u_int gctl_cmd; off_t gctl_offset; off_t gctl_length; void *gctl_data; int gctl_error; }; #endif /* !_G_GATE_H_ */ Index: head/sys/geom/geom.h =================================================================== --- head/sys/geom/geom.h (revision 350693) +++ head/sys/geom/geom.h (revision 350694) @@ -1,435 +1,436 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _GEOM_GEOM_H_ #define _GEOM_GEOM_H_ #include #include #include #include #include #include #include struct g_class; struct g_geom; struct g_consumer; struct g_provider; struct g_stat; struct thread; struct bio; struct sbuf; struct gctl_req; struct g_configargs; struct disk_zone_args; typedef int g_config_t (struct g_configargs *ca); typedef void g_ctl_req_t (struct gctl_req *, struct g_class *cp, char const *verb); typedef int g_ctl_create_geom_t (struct gctl_req *, struct g_class *cp, struct g_provider *pp); typedef int g_ctl_destroy_geom_t (struct gctl_req *, struct g_class *cp, struct g_geom *gp); typedef int g_ctl_config_geom_t (struct gctl_req *, struct g_geom *gp, const char *verb); typedef void g_init_t (struct g_class *mp); typedef void g_fini_t (struct g_class *mp); typedef struct g_geom * g_taste_t (struct g_class *, struct g_provider *, int flags); typedef int g_ioctl_t(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); #define G_TF_NORMAL 0 #define G_TF_INSIST 1 #define G_TF_TRANSPARENT 2 typedef int g_access_t (struct g_provider *, int, int, int); /* XXX: not sure about the thread arg */ typedef void g_orphan_t (struct g_consumer *); typedef void g_start_t (struct bio *); typedef void g_spoiled_t (struct g_consumer *); typedef void g_attrchanged_t (struct g_consumer *, const char *attr); typedef void g_provgone_t (struct g_provider *); typedef void g_dumpconf_t (struct sbuf *, const char *indent, struct g_geom *, struct g_consumer *, struct g_provider *); typedef void g_resize_t(struct g_consumer *cp); /* * The g_class structure describes a transformation class. In other words * all BSD disklabel handlers share one g_class, all MBR handlers share * one common g_class and so on. * Certain operations are instantiated on the class, most notably the * taste and config_geom functions. */ struct g_class { const char *name; u_int version; u_int spare0; g_taste_t *taste; g_config_t *config; g_ctl_req_t *ctlreq; g_init_t *init; g_fini_t *fini; g_ctl_destroy_geom_t *destroy_geom; /* * Default values for geom methods */ g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare1; void *spare2; /* * The remaining elements are private */ LIST_ENTRY(g_class) class; LIST_HEAD(,g_geom) geom; }; /* * The g_geom_alias is a list node for aliases for the geom name * for device node creation. */ struct g_geom_alias { LIST_ENTRY(g_geom_alias) ga_next; const char *ga_alias; }; #define G_VERSION_00 0x19950323 #define G_VERSION_01 0x20041207 /* add fflag to g_ioctl_t */ #define G_VERSION G_VERSION_01 /* * The g_geom is an instance of a g_class. */ struct g_geom { char *name; struct g_class *class; LIST_ENTRY(g_geom) geom; LIST_HEAD(,g_consumer) consumer; LIST_HEAD(,g_provider) provider; TAILQ_ENTRY(g_geom) geoms; /* XXX: better name */ int rank; g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare0; void *spare1; void *softc; unsigned flags; #define G_GEOM_WITHER 0x01 #define G_GEOM_VOLATILE_BIO 0x02 #define G_GEOM_IN_ACCESS 0x04 #define G_GEOM_ACCESS_WAIT 0x08 LIST_HEAD(,g_geom_alias) aliases; }; /* * The g_bioq is a queue of struct bio's. * XXX: possibly collection point for statistics. 
* XXX: should (possibly) be collapsed with sys/bio.h::bio_queue_head. */ struct g_bioq { TAILQ_HEAD(, bio) bio_queue; struct mtx bio_queue_lock; int bio_queue_length; }; /* * A g_consumer is an attachment point for a g_provider. One g_consumer * can only be attached to one g_provider, but multiple g_consumers * can be attached to one g_provider. */ struct g_consumer { struct g_geom *geom; LIST_ENTRY(g_consumer) consumer; struct g_provider *provider; LIST_ENTRY(g_consumer) consumers; /* XXX: better name */ int acr, acw, ace; int flags; #define G_CF_SPOILED 0x1 #define G_CF_ORPHAN 0x4 #define G_CF_DIRECT_SEND 0x10 #define G_CF_DIRECT_RECEIVE 0x20 struct devstat *stat; u_int nstart, nend; /* Two fields for the implementing class to use */ void *private; u_int index; }; /* * A g_provider is a "logical disk". */ struct g_provider { char *name; LIST_ENTRY(g_provider) provider; struct g_geom *geom; LIST_HEAD(,g_consumer) consumers; int acr, acw, ace; int error; TAILQ_ENTRY(g_provider) orphan; off_t mediasize; u_int sectorsize; off_t stripesize; off_t stripeoffset; struct devstat *stat; u_int nstart, nend; u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 #define G_PF_ACCEPT_UNMAPPED 0x8 #define G_PF_DIRECT_SEND 0x10 #define G_PF_DIRECT_RECEIVE 0x20 /* Two fields for the implementing class to use */ void *private; u_int index; }; /* * Descriptor of a classifier. We can register a function and * an argument, which is called by g_io_request() on bio's * that are not previously classified. */ struct g_classifier_hook { TAILQ_ENTRY(g_classifier_hook) link; int (*func)(void *arg, struct bio *bp); void *arg; }; /* BIO_GETATTR("GEOM::setstate") argument values. */ #define G_STATE_FAILED 0 #define G_STATE_REBUILD 1 #define G_STATE_RESYNC 2 #define G_STATE_ACTIVE 3 /* geom_dev.c */ struct cdev; void g_dev_print(void); void g_dev_physpath_changed(void); struct g_provider *g_dev_getprovider(struct cdev *dev); /* geom_dump.c */ void g_trace(int level, const char *, ...); # define G_T_TOPOLOGY 1 # define G_T_BIO 2 # define G_T_ACCESS 4 /* geom_event.c */ typedef void g_event_t(void *, int flag); #define EV_CANCEL 1 int g_post_event(g_event_t *func, void *arg, int flag, ...); int g_waitfor_event(g_event_t *func, void *arg, int flag, ...); void g_cancel_event(void *ref); int g_attr_changed(struct g_provider *pp, const char *attr, int flag); int g_media_changed(struct g_provider *pp, int flag); int g_media_gone(struct g_provider *pp, int flag); void g_orphan_provider(struct g_provider *pp, int error); void g_waitidlelock(void); /* geom_subr.c */ int g_access(struct g_consumer *cp, int nread, int nwrite, int nexcl); int g_attach(struct g_consumer *cp, struct g_provider *pp); int g_compare_names(const char *namea, const char *nameb); void g_destroy_consumer(struct g_consumer *cp); void g_destroy_geom(struct g_geom *pp); void g_destroy_provider(struct g_provider *pp); void g_detach(struct g_consumer *cp); void g_error_provider(struct g_provider *pp, int error); struct g_provider *g_provider_by_name(char const *arg); void g_geom_add_alias(struct g_geom *gp, const char *alias); int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len); #define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v)) int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len); int g_handleattr_int(struct bio *bp, const char *attribute, int val); int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val); int g_handleattr_uint16_t(struct bio *bp, const char *attribute, 
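[Editorial note] The g_handleattr_*() helpers declared here are the usual way for a class's start() method to answer BIO_GETATTR requests: they compare the attribute name, copy the value into the bio, deliver it, and return non-zero when they did so. A hedged sketch; the class and attribute names are made up:

static void
example_start(struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_GETATTR:
		if (g_handleattr_int(bp, "EXAMPLE::candelete", 1))
			return;		/* attribute matched; bio already delivered */
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
}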
uint16_t val); int g_handleattr_str(struct bio *bp, const char *attribute, const char *str); struct g_consumer * g_new_consumer(struct g_geom *gp); struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) __printflike(2, 3); struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) __printflike(2, 3); void g_resize_provider(struct g_provider *pp, off_t size); int g_retaste(struct g_class *mp); void g_spoil(struct g_provider *pp, struct g_consumer *cp); int g_std_access(struct g_provider *pp, int dr, int dw, int de); void g_std_done(struct bio *bp); void g_std_spoiled(struct g_consumer *cp); void g_wither_geom(struct g_geom *gp, int error); void g_wither_geom_close(struct g_geom *gp, int error); void g_wither_provider(struct g_provider *pp, int error); #if defined(DIAGNOSTIC) || defined(DDB) int g_valid_obj(void const *ptr); #endif #ifdef DIAGNOSTIC #define G_VALID_CLASS(foo) \ KASSERT(g_valid_obj(foo) == 1, ("%p is not a g_class", foo)) #define G_VALID_GEOM(foo) \ KASSERT(g_valid_obj(foo) == 2, ("%p is not a g_geom", foo)) #define G_VALID_CONSUMER(foo) \ KASSERT(g_valid_obj(foo) == 3, ("%p is not a g_consumer", foo)) #define G_VALID_PROVIDER(foo) \ KASSERT(g_valid_obj(foo) == 4, ("%p is not a g_provider", foo)) #else #define G_VALID_CLASS(foo) do { } while (0) #define G_VALID_GEOM(foo) do { } while (0) #define G_VALID_CONSUMER(foo) do { } while (0) #define G_VALID_PROVIDER(foo) do { } while (0) #endif int g_modevent(module_t, int, void *); /* geom_io.c */ struct bio * g_clone_bio(struct bio *); struct bio * g_duplicate_bio(struct bio *); void g_destroy_bio(struct bio *); void g_io_deliver(struct bio *bp, int error); int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); int g_register_classifier(struct g_classifier_hook *hook); void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); struct bio *g_new_bio(void); struct bio *g_alloc_bio(void); void g_reset_bio(struct bio *); void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error); int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length); int g_delete_data(struct g_consumer *cp, off_t offset, off_t length); -void g_print_bio(struct bio *bp); +void g_format_bio(struct sbuf *, const struct bio *bp); +void g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, ...) 
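[Editorial note] g_read_data() and g_write_data(), declared above and implemented further down in this diff, are the synchronous helpers GEOM classes use for metadata I/O. g_read_data() returns a buffer allocated with g_malloc() that the caller must g_free(), and reports the error through its last argument; since both wait in biowait(), callers commonly drop the topology lock around the call. A hedged sketch, with a hypothetical wrapper name:

static int
example_read_sector(struct g_consumer *cp, off_t blkno, u_char **bufp)
{
	u_char *buf;
	int error;

	buf = g_read_data(cp, blkno * cp->provider->sectorsize,
	    cp->provider->sectorsize, &error);
	if (buf == NULL)
		return (error);		/* filled in by g_read_data() */
	*bufp = buf;			/* caller frees with g_free() */
	return (0);
}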
__printflike(3, 4); int g_use_g_read_data(void *, off_t, void **, int); int g_use_g_write_data(void *, off_t, void *, int); /* geom_kern.c / geom_kernsim.c */ #ifdef _KERNEL extern struct sx topology_lock; struct g_kerneldump { off_t offset; off_t length; struct dumperinfo di; }; MALLOC_DECLARE(M_GEOM); static __inline void * g_malloc(int size, int flags) { void *p; p = malloc(size, M_GEOM, flags); return (p); } static __inline void g_free(void *ptr) { #ifdef DIAGNOSTIC if (sx_xlocked(&topology_lock)) { KASSERT(g_valid_obj(ptr) == 0, ("g_free(%p) of live object, type %d", ptr, g_valid_obj(ptr))); } #endif free(ptr, M_GEOM); } #define g_topology_lock() \ do { \ sx_xlock(&topology_lock); \ } while (0) #define g_topology_try_lock() sx_try_xlock(&topology_lock) #define g_topology_unlock() \ do { \ sx_xunlock(&topology_lock); \ } while (0) #define g_topology_assert() \ do { \ sx_assert(&topology_lock, SX_XLOCKED); \ } while (0) #define g_topology_assert_not() \ do { \ sx_assert(&topology_lock, SX_UNLOCKED); \ } while (0) #define g_topology_sleep(chan, timo) \ sx_sleep(chan, &topology_lock, 0, "gtopol", timo) #define DECLARE_GEOM_CLASS(class, name) \ static moduledata_t name##_mod = { \ #name, g_modevent, &class \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); int g_is_geom_thread(struct thread *td); #endif /* _KERNEL */ /* geom_ctl.c */ int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len); void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len); void *gctl_get_param(struct gctl_req *req, const char *param, int *len); char const *gctl_get_asciiparam(struct gctl_req *req, const char *param); void *gctl_get_paraml(struct gctl_req *req, const char *param, int len); int gctl_error(struct gctl_req *req, const char *fmt, ...) __printflike(2, 3); struct g_class *gctl_get_class(struct gctl_req *req, char const *arg); struct g_geom *gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg); struct g_provider *gctl_get_provider(struct gctl_req *req, char const *arg); #endif /* _GEOM_GEOM_H_ */ Index: head/sys/geom/geom_dbg.h =================================================================== --- head/sys/geom/geom_dbg.h (nonexistent) +++ head/sys/geom/geom_dbg.h (revision 350694) @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Conrad Meyer + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#ifdef _KERNEL + +#define _GEOM_DEBUG(classname, ctrlvar, loglvl, biop, formatstr, ...) \ +do { \ + const int __control = (ctrlvar); \ + const int __level = (loglvl); \ + \ + if (__control < __level) \ + break; \ + \ + g_dbg_printf((classname), (__control > 0) ? __level : -1, \ + (biop), ": " formatstr, ## __VA_ARGS__); \ +} while (0) + +void g_dbg_printf(const char *classname, int lvl, struct bio *bp, + const char *format, ...) __printflike(4, 5); + +#endif /* _KERNEL */ Property changes on: head/sys/geom/geom_dbg.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 350693) +++ head/sys/geom/geom_io.c (revision 350694) @@ -1,1095 +1,1127 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int pace; static uma_zone_t biozone; /* * The head of the list of classifiers used in g_io_request. * Use g_register_classifier() and g_unregister_classifier() * to add/remove entries to the list. * Classifiers are invoked in registration order. */ static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook) g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq); #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. 
* BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. */ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; bp2->bio_classifier2 = bp->bio_classifier2; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. 
*/ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } /* * bio classification support. * * g_register_classifier() and g_unregister_classifier() * are used to add/remove a classifier from the list. * The list is protected using the g_bio_run_down lock, * because the classifiers are called in this path. * * g_io_request() passes bio's that are not already classified * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). 
* Classifiers can store their result in the two fields * bio_classifier1 and bio_classifier2. * A classifier that updates one of the fields should * return a non-zero value. * If no classifier updates the field, g_run_classifiers() sets * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. */ int g_register_classifier(struct g_classifier_hook *hook) { g_bioq_lock(&g_bio_run_down); TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); g_bioq_unlock(&g_bio_run_down); return (0); } void g_unregister_classifier(struct g_classifier_hook *hook) { struct g_classifier_hook *entry; g_bioq_lock(&g_bio_run_down); TAILQ_FOREACH(entry, &g_classifier_tailq, link) { if (entry == hook) { TAILQ_REMOVE(&g_classifier_tailq, hook, link); break; } } g_bioq_unlock(&g_bio_run_down); } static void g_run_classifiers(struct bio *bp) { struct g_classifier_hook *hook; int classified = 0; biotrack(bp, __func__); TAILQ_FOREACH(hook, &g_classifier_tailq, link) classified |= hook->func(hook->arg, bp); if (!classified) bp->bio_classifier1 = BIO_NOTCLASSIFIED; } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. */ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. 
*/ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { g_bioq_lock(&g_bio_run_down); g_run_classifiers(bp); g_bioq_unlock(&g_bio_run_down); } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
*/ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. 
*/ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. 
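The pacing described above reduces to a one-bit handshake: the completion path records that an allocation failed, and the dispatch loop pauses briefly once per outage instead of once per failure. A schematic sketch of that handshake with hypothetical names (not the actual g_down code):

	static volatile u_int pace_hint;	/* stand-in for the 'pace' variable above */

	/* Completion side: note that memory was too tight to finish a request. */
	static void
	note_alloc_failure(void)
	{
		pace_hint = 1;
	}

	/* Dispatch loop: pause for one tick at most once per outage, then move on. */
	static void
	maybe_pace(void)
	{
		if (pace_hint != 0) {
			pause("pace", 1);
			pace_hint = 0;
		}
	}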
*/ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } /* * A read function for use by ffs_sbget when used by GEOM-layer routines. */ int g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size) { struct g_consumer *cp; KASSERT(*bufp == NULL, ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp)); cp = (struct g_consumer *)devfd; /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be multiples of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (loc % cp->provider->sectorsize != 0) return (ENOENT); *bufp = g_read_data(cp, loc, size, NULL); if (*bufp == NULL) return (ENOENT); return (0); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } /* * A write function for use by ffs_sbput when used by GEOM-layer routines. */ int g_use_g_write_data(void *devfd, off_t loc, void *buf, int size) { return (g_write_data((struct g_consumer *)devfd, loc, buf, size)); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void -g_print_bio(struct bio *bp) +g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, + ...) 
{ +#ifndef PRINTF_BUFR_SIZE +#define PRINTF_BUFR_SIZE 64 +#endif + char bufr[PRINTF_BUFR_SIZE]; + struct sbuf sb, *sbp __unused; + va_list ap; + + sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); + KASSERT(sbp != NULL, ("sbuf_new misused?")); + + sbuf_set_drain(&sb, sbuf_printf_drain, NULL); + + sbuf_cat(&sb, prefix); + g_format_bio(&sb, bp); + + va_start(ap, fmtsuffix); + sbuf_vprintf(&sb, fmtsuffix, ap); + va_end(ap); + + sbuf_nl_terminate(&sb); + + sbuf_finish(&sb); + sbuf_delete(&sb); +} + +void +g_format_bio(struct sbuf *sb, const struct bio *bp) +{ const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; - printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); + sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd, + bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; - printf("%s[%s]", pname, cmd); + sbuf_printf(sb, "%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } - printf("%s[%s,%s]", pname, cmd, subcmd); + sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; - printf("%s[%s()]", pname, cmd); + sbuf_printf(sb, "%s[%s()]", pname, cmd); return; } - printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd, + sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/geom/geom_subr.c =================================================================== --- head/sys/geom/geom_subr.c (revision 350693) +++ head/sys/geom/geom_subr.c (revision 350694) @@ -1,1613 +1,1652 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; struct g_hh00 { struct g_class *mp; struct g_provider *pp; off_t size; int error; int post; }; + +void +g_dbg_printf(const char *classname, int lvl, struct bio *bp, + const char *format, + ...) +{ +#ifndef PRINTF_BUFR_SIZE +#define PRINTF_BUFR_SIZE 64 +#endif + char bufr[PRINTF_BUFR_SIZE]; + struct sbuf sb, *sbp __unused; + va_list ap; + + sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); + KASSERT(sbp != NULL, ("sbuf_new misused?")); + + sbuf_set_drain(&sb, sbuf_printf_drain, NULL); + + sbuf_cat(&sb, classname); + if (lvl >= 0) + sbuf_printf(&sb, "[%d]", lvl); + + va_start(ap, format); + sbuf_vprintf(&sb, format, ap); + va_end(ap); + + if (bp != NULL) { + sbuf_putc(&sb, ' '); + g_format_bio(&sb, bp); + } + + /* Terminate the debug line with a single '\n'. */ + sbuf_nl_terminate(&sb); + + /* Flush line to printf. */ + sbuf_finish(&sb); + sbuf_delete(&sb); +} /* * This event offers a new class a chance to taste all preexisting providers. */ static void g_load_class(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp2, *mp; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ return; if (g_shutdown) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); KASSERT(mp->name != NULL && *mp->name != '\0', ("GEOM class has no name")); LIST_FOREACH(mp2, &g_classes, class) { if (mp2 == mp) { printf("The GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } else if (strcmp(mp2->name, mp->name) == 0) { printf("A GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } } LIST_INIT(&mp->geom); LIST_INSERT_HEAD(&g_classes, mp, class); if (mp->init != NULL) mp->init(mp); if (mp->taste == NULL) return; LIST_FOREACH(mp2, &g_classes, class) { if (mp == mp2) continue; LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { mp->taste(mp, pp, 0); g_topology_assert(); } } } } static int g_unload_class(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; int error; g_topology_lock(); g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); retry: G_VALID_CLASS(mp); LIST_FOREACH(gp, &mp->geom, geom) { /* We refuse to unload if anything is open */ LIST_FOREACH(pp, &gp->provider, provider) if (pp->acr || pp->acw || pp->ace) { g_topology_unlock(); return (EBUSY); } LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) { g_topology_unlock(); return (EBUSY); } /* If the geom is withering, wait for it to finish. */ if (gp->flags & G_GEOM_WITHER) { g_topology_sleep(mp, 1); goto retry; } } /* * We allow unloading if we have no geoms, or a class * method we can use to get rid of them. */ if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) { g_topology_unlock(); return (EOPNOTSUPP); } /* Bar new entries */ mp->taste = NULL; mp->config = NULL; LIST_FOREACH(gp, &mp->geom, geom) { error = mp->destroy_geom(NULL, mp, gp); if (error != 0) { g_topology_unlock(); return (error); } } /* Wait for withering to finish. */ for (;;) { gp = LIST_FIRST(&mp->geom); if (gp == NULL) break; KASSERT(gp->flags & G_GEOM_WITHER, ("Non-withering geom in class %s", mp->name)); g_topology_sleep(mp, 1); } G_VALID_CLASS(mp); if (mp->fini != NULL) mp->fini(mp); LIST_REMOVE(mp, class); g_topology_unlock(); return (0); } int g_modevent(module_t mod, int type, void *data) { struct g_hh00 *hh; int error; static int g_ignition; struct g_class *mp; mp = data; if (mp->version != G_VERSION) { printf("GEOM class %s has Wrong version %x\n", mp->name, mp->version); return (EINVAL); } if (!g_ignition) { g_ignition++; g_init(); } error = EOPNOTSUPP; switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be * from the userland and the g_event thread will be able * to acknowledge their completion. 
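For context, g_load_class() and g_modevent() above are what a class's module glue ends up invoking. A minimal, hypothetical class skeleton showing the pieces they rely on, namely the version field checked by g_modevent() and the taste method offered to every preexisting provider (an illustrative sketch, not code from this commit):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <geom/geom.h>

	static g_taste_t g_example_taste;

	static struct g_geom *
	g_example_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
	{
		/*
		 * Inspect pp (typically by creating a consumer, attaching and
		 * reading metadata) and return a new geom, or NULL if the
		 * provider is not ours.
		 */
		return (NULL);
	}

	static struct g_class g_example_class = {
		.name = "EXAMPLE",
		.version = G_VERSION,		/* checked by g_modevent() */
		.taste = g_example_taste,
	};

	/* Expands to the module glue whose event handler calls g_modevent(). */
	DECLARE_GEOM_CLASS(g_example_class, g_example);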
*/ if (cold) { hh->post = 1; error = g_post_event(g_load_class, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_load_class, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } break; case MOD_UNLOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name); error = g_unload_class(mp); if (error == 0) { KASSERT(LIST_EMPTY(&mp->geom), ("Unloaded class (%s) still has geom", mp->name)); } break; } return (error); } static void g_retaste_event(void *arg, int flag) { struct g_class *mp, *mp2; struct g_geom *gp; struct g_hh00 *hh; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? */ return; if (g_shutdown || g_notaste) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name); LIST_FOREACH(mp2, &g_classes, class) { LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr || pp->acw || pp->ace) continue; LIST_FOREACH(cp, &pp->consumers, consumers) { if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; } if (cp != NULL) { cp->flags |= G_CF_ORPHAN; g_wither_geom(cp->geom, ENXIO); } mp->taste(mp, pp, 0); g_topology_assert(); } } } } int g_retaste(struct g_class *mp) { struct g_hh00 *hh; int error; if (mp->taste == NULL) return (EINVAL); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { hh->post = 1; error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } return (error); } struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) { struct g_geom *gp; va_list ap; struct sbuf *sb; g_topology_assert(); G_VALID_CLASS(mp); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); LIST_INIT(&gp->provider); LIST_INIT(&gp->aliases); LIST_INSERT_HEAD(&mp->geom, gp, geom); TAILQ_INSERT_HEAD(&geoms, gp, geoms); strcpy(gp->name, sbuf_data(sb)); sbuf_delete(sb); /* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; gp->attrchanged = mp->attrchanged; gp->providergone = mp->providergone; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; gp->ioctl = mp->ioctl; gp->resize = mp->resize; return (gp); } void g_destroy_geom(struct g_geom *gp) { struct g_geom_alias *gap, *gaptmp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); KASSERT(LIST_EMPTY(&gp->consumer), ("g_destroy_geom(%s) with consumer(s) [%p]", gp->name, LIST_FIRST(&gp->consumer))); KASSERT(LIST_EMPTY(&gp->provider), ("g_destroy_geom(%s) with provider(s) [%p]", gp->name, LIST_FIRST(&gp->provider))); g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); LIST_FOREACH_SAFE(gap, &gp->aliases, ga_next, gaptmp) g_free(gap); g_free(gp->name); g_free(gp); } /* * This function is called (repeatedly) until the geom has withered away. 
*/ void g_wither_geom(struct g_geom *gp, int error) { struct g_provider *pp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); if (!(gp->flags & G_GEOM_WITHER)) { gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } g_do_wither(); } /* * Convenience function to destroy a particular provider. */ void g_wither_provider(struct g_provider *pp, int error) { pp->flags |= G_PF_WITHER; if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom_close(struct g_geom *gp, int error) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name); LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_wither_geom(gp, error); } /* * This function is called (repeatedly) until we can't wash away more * withered bits at present. */ void g_wither_washer() { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp, *pp2; struct g_consumer *cp, *cp2; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (!(pp->flags & G_PF_WITHER)) continue; if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } if (!(gp->flags & G_GEOM_WITHER)) continue; LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) { if (cp->acr || cp->acw || cp->ace) continue; if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); } } } struct g_consumer * g_new_consumer(struct g_geom *gp) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(!(gp->flags & G_GEOM_WITHER), ("g_new_consumer on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); KASSERT(gp->orphan != NULL, ("g_new_consumer on geom(%s) (class %s) without orphan", gp->name, gp->class->name)); cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); cp->geom = gp; cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->consumer, cp, consumer); return(cp); } void g_destroy_consumer(struct g_consumer *cp) { struct g_geom *gp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp); KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached")); KASSERT (cp->acr == 0, ("g_destroy_consumer with acr")); KASSERT (cp->acw == 0, ("g_destroy_consumer with acw")); KASSERT (cp->ace == 0, ("g_destroy_consumer with ace")); g_cancel_event(cp); gp = cp->geom; LIST_REMOVE(cp, consumer); devstat_remove_entry(cp->stat); g_free(cp); if (gp->flags & G_GEOM_WITHER) g_do_wither(); } static void g_new_provider_event(void *arg, int flag) { struct g_class *mp; struct g_provider *pp; struct g_consumer *cp, *next_cp; g_topology_assert(); if (flag == EV_CANCEL) return; if (g_shutdown) return; pp = arg; G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_new_provider_event but withered")); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if ((cp->flags & G_CF_ORPHAN) == 0 && cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, "GEOM::media"); } if (g_notaste) return; LIST_FOREACH(mp,
&g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) { struct g_provider *pp; struct sbuf *sb; va_list ap; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(gp->access != NULL, ("new provider on geom(%s) without ->access (class %s)", gp->name, gp->class->name)); KASSERT(gp->start != NULL, ("new provider on geom(%s) without ->start (class %s)", gp->name, gp->class->name)); KASSERT(!(gp->flags & G_GEOM_WITHER), ("new provider on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); LIST_INIT(&pp->consumers); pp->error = ENXIO; pp->geom = gp; pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->provider, pp, provider); g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL); return (pp); } void g_error_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp); We may not have g_topology */ pp->error = error; } static void g_resize_provider_event(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; off_t size; g_topology_assert(); if (g_shutdown) return; hh = arg; pp = hh->pp; size = hh->size; g_free(hh); G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_resize_provider_event but withered")); g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize == NULL && size < pp->mediasize) { /* * XXX: the g_dev_orphan method does deferred destroying, * and it is possible that another event has already * called the orphan method. Check the consumer's flags * so that we do not schedule it twice. */ if (cp->flags & G_CF_ORPHAN) continue; cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } } pp->mediasize = size; LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL) gp->resize(cp); } /* * After resizing, the previously invalid GEOM class metadata * might become valid. This means we should retaste.
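In practice the event above is driven from two sides: the provider's owner announces the new media size with g_resize_provider(), defined just below, and consumer classes that can react implement the resize method that g_new_geomf() copies into each geom. A hedged sketch of both halves, with hypothetical names and with locking and error handling elided:

	/* Provider side: the underlying driver noticed a new media size. */
	static void
	example_announce_new_size(struct g_provider *pp, off_t new_size)
	{
		g_resize_provider(pp, new_size);	/* queues g_resize_provider_event() */
	}

	/* Consumer side: wired up as the class's ->resize method. */
	static void
	g_example_resize(struct g_consumer *cp)
	{
		/*
		 * Re-validate our metadata against cp->provider->mediasize and
		 * grow, shrink or wither our own provider(s) accordingly.
		 */
	}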
*/ LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } void g_resize_provider(struct g_provider *pp, off_t size) { struct g_hh00 *hh; G_VALID_PROVIDER(pp); if (pp->flags & G_PF_WITHER) return; if (size == pp->mediasize) return; hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); } #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif struct g_provider * g_provider_by_name(char const *arg) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp, *wpp; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) arg += sizeof(_PATH_DEV) - 1; wpp = NULL; LIST_FOREACH(cp, &g_classes, class) { LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(arg, pp->name) != 0) continue; if ((gp->flags & G_GEOM_WITHER) == 0 && (pp->flags & G_PF_WITHER) == 0) return (pp); else wpp = pp; } } } return (wpp); } void g_destroy_provider(struct g_provider *pp) { struct g_geom *gp; g_topology_assert(); G_VALID_PROVIDER(pp); KASSERT(LIST_EMPTY(&pp->consumers), ("g_destroy_provider but attached")); KASSERT (pp->acr == 0, ("g_destroy_provider with acr")); KASSERT (pp->acw == 0, ("g_destroy_provider with acw")); KASSERT (pp->ace == 0, ("g_destroy_provider with ace")); g_cancel_event(pp); LIST_REMOVE(pp, provider); gp = pp->geom; devstat_remove_entry(pp->stat); /* * If a callback was provided, send notification that the provider * is now gone. */ if (gp->providergone != NULL) gp->providergone(pp); g_free(pp); if ((gp->flags & G_GEOM_WITHER)) g_do_wither(); } /* * We keep the "geoms" list sorted by topological order (== increasing * numerical rank) at all times. * When an attach is done, the attaching geom's rank is invalidated * and it is moved to the tail of the list. * All geoms later in the sequence have their ranks reevaluated in * sequence. If we cannot assign rank to a geom because its * prerequisites do not have rank, we move that element to the tail * of the sequence with invalid rank as well. * At some point we encounter our original geom and if we still fail * to assign it a rank, there must be a loop and we fail back to * g_attach() which detaches again and calls redo_rank again * to fix up the damage. * It would be much simpler code-wise to do it recursively, but we * can't risk that on the kernel stack.
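The rule that redo_rank() below enforces is simple even though the list surgery around it is not: a geom that consumes from nobody gets rank 1, otherwise its rank is one more than the highest rank among the geoms it consumes from, and 0 means "not computable yet". A chain such as disk -> partition -> label therefore ends up ranked 1, 2, 3. The core of that calculation, pulled out as an illustrative helper (not part of the source):

	static int
	example_rank_of(struct g_geom *gp)
	{
		struct g_consumer *cp;
		int m, n;

		m = 1;				/* consumes from nobody: a root, rank 1 */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == NULL)
				continue;
			n = cp->provider->geom->rank;
			if (n == 0)
				return (0);	/* prerequisite not ranked yet */
			if (n >= m)
				m = n + 1;
		}
		return (m);
	}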
*/ static int redo_rank(struct g_geom *gp) { struct g_consumer *cp; struct g_geom *gp1, *gp2; int n, m; g_topology_assert(); G_VALID_GEOM(gp); /* Invalidate this geoms rank and move it to the tail */ gp1 = TAILQ_NEXT(gp, geoms); if (gp1 != NULL) { gp->rank = 0; TAILQ_REMOVE(&geoms, gp, geoms); TAILQ_INSERT_TAIL(&geoms, gp, geoms); } else { gp1 = gp; } /* re-rank the rest of the sequence */ for (; gp1 != NULL; gp1 = gp2) { gp1->rank = 0; m = 1; LIST_FOREACH(cp, &gp1->consumer, consumer) { if (cp->provider == NULL) continue; n = cp->provider->geom->rank; if (n == 0) { m = 0; break; } else if (n >= m) m = n + 1; } gp1->rank = m; gp2 = TAILQ_NEXT(gp1, geoms); /* got a rank, moving on */ if (m != 0) continue; /* no rank to original geom means loop */ if (gp == gp1) return (ELOOP); /* no rank, put it at the end move on */ TAILQ_REMOVE(&geoms, gp1, geoms); TAILQ_INSERT_TAIL(&geoms, gp1, geoms); } return (0); } int g_attach(struct g_consumer *cp, struct g_provider *pp) { int error; g_topology_assert(); G_VALID_CONSUMER(cp); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp); KASSERT(cp->provider == NULL, ("attach but attached")); cp->provider = pp; cp->flags &= ~G_CF_ORPHAN; LIST_INSERT_HEAD(&pp->consumers, cp, consumers); error = redo_rank(cp->geom); if (error) { LIST_REMOVE(cp, consumers); cp->provider = NULL; redo_rank(cp->geom); } return (error); } void g_detach(struct g_consumer *cp) { struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp); KASSERT(cp->provider != NULL, ("detach but not attached")); KASSERT(cp->acr == 0, ("detach but nonzero acr")); KASSERT(cp->acw == 0, ("detach but nonzero acw")); KASSERT(cp->ace == 0, ("detach but nonzero ace")); KASSERT(cp->nstart == cp->nend, ("detach with active requests")); pp = cp->provider; LIST_REMOVE(cp, consumers); cp->provider = NULL; if ((cp->geom->flags & G_GEOM_WITHER) || (pp->geom->flags & G_GEOM_WITHER) || (pp->flags & G_PF_WITHER)) g_do_wither(); redo_rank(cp->geom); } /* * g_access() * * Access-check with delta values. The question asked is "can provider * "cp" change the access counters by the relative amounts dc[rwe] ?" */ int g_access(struct g_consumer *cp, int dcr, int dcw, int dce) { struct g_provider *pp; struct g_geom *gp; int pw, pe; #ifdef INVARIANTS int sr, sw, se; #endif int error; g_topology_assert(); G_VALID_CONSUMER(cp); pp = cp->provider; KASSERT(pp != NULL, ("access but not attached")); G_VALID_PROVIDER(pp); gp = pp->geom; g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)", cp, pp->name, dcr, dcw, dce); KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr")); KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw")); KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace")); KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request")); KASSERT(gp->access != NULL, ("NULL geom->access")); /* * If our class cares about being spoiled, and we have been, we * are probably just ahead of the event telling us that. Fail * now rather than having to unravel this later. */ if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) && (dcr > 0 || dcw > 0 || dce > 0)) return (ENXIO); /* * A number of GEOM classes either need to perform an I/O on the first * open or to acquire a different subsystem's lock. To do that they * may have to drop the topology lock. * Other GEOM classes perform special actions when opening a lower rank * geom for the first time. As a result, more than one thread may * end up performing the special actions. 
* So, we prevent concurrent "first" opens by marking the consumer with * special flag. * * Note that if the geom's access method never drops the topology lock, * then we will never see G_GEOM_IN_ACCESS here. */ while ((gp->flags & G_GEOM_IN_ACCESS) != 0) { g_trace(G_T_ACCESS, "%s: race on geom %s via provider %s and consumer of %s", __func__, gp->name, pp->name, cp->geom->name); gp->flags |= G_GEOM_ACCESS_WAIT; g_topology_sleep(gp, 0); } /* * Figure out what counts the provider would have had, if this * consumer had (r0w0e0) at this time. */ pw = pp->acw - cp->acw; pe = pp->ace - cp->ace; g_trace(G_T_ACCESS, "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)", dcr, dcw, dce, cp->acr, cp->acw, cp->ace, pp->acr, pp->acw, pp->ace, pp, pp->name); /* If foot-shooting is enabled, any open on rank#1 is OK */ if ((g_debugflags & 16) && gp->rank == 1) ; /* If we try exclusive but already write: fail */ else if (dce > 0 && pw > 0) return (EPERM); /* If we try write but already exclusive: fail */ else if (dcw > 0 && pe > 0) return (EPERM); /* If we try to open more but provider is error'ed: fail */ else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) { printf("%s(%d): provider %s has error %d set\n", __func__, __LINE__, pp->name, pp->error); return (pp->error); } /* Ok then... */ #ifdef INVARIANTS sr = cp->acr; sw = cp->acw; se = cp->ace; #endif gp->flags |= G_GEOM_IN_ACCESS; error = gp->access(pp, dcr, dcw, dce); KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0, ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed " "closing ->access()", gp->class->name, pp->name, dcr, dcw, dce, error)); g_topology_assert(); gp->flags &= ~G_GEOM_IN_ACCESS; KASSERT(cp->acr == sr && cp->acw == sw && cp->ace == se, ("Access counts changed during geom->access")); if ((gp->flags & G_GEOM_ACCESS_WAIT) != 0) { gp->flags &= ~G_GEOM_ACCESS_WAIT; wakeup(gp); } if (!error) { /* * If we open first write, spoil any partner consumers. * If we close last write and provider is not errored, * trigger re-taste. 
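As a usage note, consumers drive all of this with small relative deltas and give back exactly what they took when closing. A hedged sketch of the typical call pattern (topology lock held by the caller; hypothetical helper names, not code from this commit):

	/* Open read-only: +1 read, no write, no exclusive. */
	static int
	example_open_ro(struct g_consumer *cp)
	{
		return (g_access(cp, 1, 0, 0));
	}

	/*
	 * Upgrade to read-write plus the exclusive bit; this can fail with
	 * EPERM if another consumer already holds write (when asking for +e)
	 * or exclusive (when asking for +w) counts on the provider.
	 */
	static int
	example_upgrade_rw(struct g_consumer *cp)
	{
		return (g_access(cp, 0, 1, 1));
	}

	/* Close: drop whatever this consumer still holds. */
	static void
	example_close(struct g_consumer *cp)
	{
		if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
			(void)g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	}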
*/ if (pp->acw == 0 && dcw != 0) g_spoil(pp, cp); else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 && !(gp->flags & G_GEOM_WITHER)) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); pp->acr += dcr; pp->acw += dcw; pp->ace += dce; cp->acr += dcr; cp->acw += dcw; cp->ace += dce; if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) KASSERT(pp->sectorsize > 0, ("Provider %s lacks sectorsize", pp->name)); if ((cp->geom->flags & G_GEOM_WITHER) && cp->acr == 0 && cp->acw == 0 && cp->ace == 0) g_do_wither(); } return (error); } int g_handleattr_int(struct bio *bp, const char *attribute, int val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_str(struct bio *bp, const char *attribute, const char *str) { return (g_handleattr(bp, attribute, str, 0)); } int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len) { int error = 0; if (strcmp(bp->bio_attribute, attribute)) return (0); if (len == 0) { bzero(bp->bio_data, bp->bio_length); if (strlcpy(bp->bio_data, val, bp->bio_length) >= bp->bio_length) { printf("%s: %s %s bio_length %jd strlen %zu -> EFAULT\n", __func__, bp->bio_to->name, attribute, (intmax_t)bp->bio_length, strlen(val)); error = EFAULT; } } else if (bp->bio_length == len) { bcopy(val, bp->bio_data, len); } else { printf("%s: %s %s bio_length %jd len %d -> EFAULT\n", __func__, bp->bio_to->name, attribute, (intmax_t)bp->bio_length, len); error = EFAULT; } if (error == 0) bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); return (1); } int g_std_access(struct g_provider *pp, int dr __unused, int dw __unused, int de __unused) { g_topology_assert(); G_VALID_PROVIDER(pp); return (0); } void g_std_done(struct bio *bp) { struct bio *bp2; bp2 = bp->bio_parent; if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; g_destroy_bio(bp); bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) g_io_deliver(bp2, bp2->bio_error); } /* XXX: maybe this is only g_slice_spoiled */ void g_std_spoiled(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp); cp->flags |= G_CF_ORPHAN; g_detach(cp); gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_orphan_provider(pp, ENXIO); g_destroy_consumer(cp); if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); else gp->flags |= G_GEOM_WITHER; } /* * Spoiling happens when a provider is opened for writing, but consumers * which are configured by in-band data are attached (slicers for instance). * Since the write might potentially change the in-band data, such consumers * need to re-evaluate their existence after the writing session closes. * We do this by (offering to) tear them down when the open for write happens * in return for a re-taste when it closes again. * Together with the fact that such consumers grab an 'e' bit whenever they * are open, regardless of mode, this ends up DTRT. 
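The contract described above is what the class's spoiled method exists for. A class that configures itself from in-band metadata typically either points it at g_std_spoiled() or wraps it; a hypothetical sketch (not part of this change):

	static void
	g_example_spoiled(struct g_consumer *cp)
	{
		/*
		 * Runs from g_spoil_event() in the event thread, topology lock
		 * held, after another consumer opened our provider for
		 * writing; the stock reaction tears this consumer down and the
		 * class gets re-tasted once the writer closes.
		 */
		g_std_spoiled(cp);
	}

	static struct g_class g_example_slicer_class = {
		.name = "EXAMPLESLICER",
		.version = G_VERSION,
		.spoiled = g_example_spoiled,	/* or g_std_spoiled directly */
	};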
*/ static void g_spoil_event(void *arg, int flag) { struct g_provider *pp; struct g_consumer *cp, *cp2; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp, pp->geom->class->name, pp->geom->name, pp->name); for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { cp2 = LIST_NEXT(cp, consumers); if ((cp->flags & G_CF_SPOILED) == 0) continue; cp->flags &= ~G_CF_SPOILED; if (cp->geom->spoiled == NULL) continue; cp->geom->spoiled(cp); g_topology_assert(); } } void g_spoil(struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; g_topology_assert(); G_VALID_PROVIDER(pp); G_VALID_CONSUMER(cp); LIST_FOREACH(cp2, &pp->consumers, consumers) { if (cp2 == cp) continue; /* KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); */ KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); cp2->flags |= G_CF_SPOILED; } g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); } static void g_media_changed_event(void *arg, int flag) { struct g_provider *pp; int retaste; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); /* * If provider was not open for writing, queue retaste after spoiling. * If it was, retaste will happen automatically on close. */ retaste = (pp->acw == 0 && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)); g_spoil_event(arg, flag); if (retaste) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); } int g_media_changed(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_media_changed_event, pp, flag, pp, NULL)); } int g_media_gone(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_spoil_event, pp, flag, pp, NULL)); } int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) { int error, i; i = len; error = g_io_getattr(attr, cp, &i, var); if (error) return (error); if (i != len) return (EINVAL); return (0); } static int g_get_device_prefix_len(const char *name) { int len; if (strncmp(name, "ada", 3) == 0) len = 3; else if (strncmp(name, "ad", 2) == 0) len = 2; else return (0); if (name[len] < '0' || name[len] > '9') return (0); do { len++; } while (name[len] >= '0' && name[len] <= '9'); return (len); } int g_compare_names(const char *namea, const char *nameb) { int deva, devb; if (strcmp(namea, nameb) == 0) return (1); deva = g_get_device_prefix_len(namea); if (deva == 0) return (0); devb = g_get_device_prefix_len(nameb); if (devb == 0) return (0); if (strcmp(namea + deva, nameb + devb) == 0) return (1); return (0); } void g_geom_add_alias(struct g_geom *gp, const char *alias) { struct g_geom_alias *gap; gap = (struct g_geom_alias *)g_malloc( sizeof(struct g_geom_alias) + strlen(alias) + 1, M_WAITOK); strcpy((char *)(gap + 1), alias); gap->ga_alias = (const char *)(gap + 1); LIST_INSERT_HEAD(&gp->aliases, gap, ga_next); } #if defined(DIAGNOSTIC) || defined(DDB) /* * This function walks the mesh and returns a non-zero integer if it * finds the argument pointer is an object. The return value indicates * which type of object it is believed to be. If topology is not locked, * this function is potentially dangerous, but we don't assert that the * topology lock is held when called from debugger. 
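Since g_valid_obj() below exists mainly so the DDB commands defined further down can safely be pointed at arbitrary addresses, the debugger-side usage is worth noting. Assuming a kernel with DDB, the commands this file provides are (illustrative invocations, output omitted):

	db> show geom            (no address: print the entire topology)
	db> show geom <addr>     (classify the address and print that one object)
	db> show bio <addr>      (dump a single struct bio)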
*/ int g_valid_obj(void const *ptr) { struct g_class *mp; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; #ifdef KDB if (kdb_active == 0) #endif g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (ptr == mp) return (1); LIST_FOREACH(gp, &mp->geom, geom) { if (ptr == gp) return (2); LIST_FOREACH(cp, &gp->consumer, consumer) if (ptr == cp) return (3); LIST_FOREACH(pp, &gp->provider, provider) if (ptr == pp) return (4); } } return(0); } #endif #ifdef DDB #define gprintf(...) do { \ db_printf("%*s", indent, ""); \ db_printf(__VA_ARGS__); \ } while (0) #define gprintln(...) do { \ gprintf(__VA_ARGS__); \ db_printf("\n"); \ } while (0) #define ADDFLAG(obj, flag, sflag) do { \ if ((obj)->flags & (flag)) { \ if (comma) \ strlcat(str, ",", size); \ strlcat(str, (sflag), size); \ comma = 1; \ } \ } while (0) static char * provider_flags_to_string(struct g_provider *pp, char *str, size_t size) { int comma = 0; bzero(str, size); if (pp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER"); ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN"); return (str); } static char * geom_flags_to_string(struct g_geom *gp, char *str, size_t size) { int comma = 0; bzero(str, size); if (gp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER"); return (str); } static void db_show_geom_consumer(int indent, struct g_consumer *cp) { if (indent == 0) { gprintln("consumer: %p", cp); gprintln(" class: %s (%p)", cp->geom->class->name, cp->geom->class); gprintln(" geom: %s (%p)", cp->geom->name, cp->geom); if (cp->provider == NULL) gprintln(" provider: none"); else { gprintln(" provider: %s (%p)", cp->provider->name, cp->provider); } gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace); gprintln(" flags: 0x%04x", cp->flags); gprintln(" nstart: %u", cp->nstart); gprintln(" nend: %u", cp->nend); } else { gprintf("consumer: %p (%s), access=r%dw%de%d", cp, cp->provider != NULL ? 
cp->provider->name : "none", cp->acr, cp->acw, cp->ace); if (cp->flags) db_printf(", flags=0x%04x", cp->flags); db_printf("\n"); } } static void db_show_geom_provider(int indent, struct g_provider *pp) { struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("provider: %s (%p)", pp->name, pp); gprintln(" class: %s (%p)", pp->geom->class->name, pp->geom->class); gprintln(" geom: %s (%p)", pp->geom->name, pp->geom); gprintln(" mediasize: %jd", (intmax_t)pp->mediasize); gprintln(" sectorsize: %u", pp->sectorsize); gprintln(" stripesize: %ju", (uintmax_t)pp->stripesize); gprintln(" stripeoffset: %ju", (uintmax_t)pp->stripeoffset); gprintln(" access: r%dw%de%d", pp->acr, pp->acw, pp->ace); gprintln(" flags: %s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); gprintln(" error: %d", pp->error); gprintln(" nstart: %u", pp->nstart); gprintln(" nend: %u", pp->nend); if (LIST_EMPTY(&pp->consumers)) gprintln(" consumers: none"); } else { gprintf("provider: %s (%p), access=r%dw%de%d", pp->name, pp, pp->acr, pp->acw, pp->ace); if (pp->flags != 0) { db_printf(", flags=%s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&pp->consumers)) { LIST_FOREACH(cp, &pp->consumers, consumers) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_geom(int indent, struct g_geom *gp) { struct g_provider *pp; struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("geom: %s (%p)", gp->name, gp); gprintln(" class: %s (%p)", gp->class->name, gp->class); gprintln(" flags: %s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); gprintln(" rank: %d", gp->rank); if (LIST_EMPTY(&gp->provider)) gprintln(" providers: none"); if (LIST_EMPTY(&gp->consumer)) gprintln(" consumers: none"); } else { gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank); if (gp->flags != 0) { db_printf(", flags=%s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&gp->provider)) { LIST_FOREACH(pp, &gp->provider, provider) { db_show_geom_provider(indent + 2, pp); if (db_pager_quit) break; } } if (!LIST_EMPTY(&gp->consumer)) { LIST_FOREACH(cp, &gp->consumer, consumer) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_class(struct g_class *mp) { struct g_geom *gp; db_printf("class: %s (%p)\n", mp->name, mp); LIST_FOREACH(gp, &mp->geom, geom) { db_show_geom_geom(2, gp); if (db_pager_quit) break; } } /* * Print the GEOM topology or the given object. */ DB_SHOW_COMMAND(geom, db_show_geom) { struct g_class *mp; if (!have_addr) { /* No address given, print the entire topology. 
*/ LIST_FOREACH(mp, &g_classes, class) { db_show_geom_class(mp); db_printf("\n"); if (db_pager_quit) break; } } else { switch (g_valid_obj((void *)addr)) { case 1: db_show_geom_class((struct g_class *)addr); break; case 2: db_show_geom_geom(0, (struct g_geom *)addr); break; case 3: db_show_geom_consumer(0, (struct g_consumer *)addr); break; case 4: db_show_geom_provider(0, (struct g_provider *)addr); break; default: db_printf("Not a GEOM object.\n"); break; } } } static void db_print_bio_cmd(struct bio *bp) { db_printf(" cmd: "); switch (bp->bio_cmd) { case BIO_READ: db_printf("BIO_READ"); break; case BIO_WRITE: db_printf("BIO_WRITE"); break; case BIO_DELETE: db_printf("BIO_DELETE"); break; case BIO_GETATTR: db_printf("BIO_GETATTR"); break; case BIO_FLUSH: db_printf("BIO_FLUSH"); break; case BIO_CMD0: db_printf("BIO_CMD0"); break; case BIO_CMD1: db_printf("BIO_CMD1"); break; case BIO_CMD2: db_printf("BIO_CMD2"); break; case BIO_ZONE: db_printf("BIO_ZONE"); break; default: db_printf("UNKNOWN"); break; } db_printf("\n"); } static void db_print_bio_flags(struct bio *bp) { int comma; comma = 0; db_printf(" flags: "); if (bp->bio_flags & BIO_ERROR) { db_printf("BIO_ERROR"); comma = 1; } if (bp->bio_flags & BIO_DONE) { db_printf("%sBIO_DONE", (comma ? ", " : "")); comma = 1; } if (bp->bio_flags & BIO_ONQUEUE) db_printf("%sBIO_ONQUEUE", (comma ? ", " : "")); db_printf("\n"); } /* * Print useful information in a BIO */ DB_SHOW_COMMAND(bio, db_show_bio) { struct bio *bp; if (have_addr) { bp = (struct bio *)addr; db_printf("BIO %p\n", bp); db_print_bio_cmd(bp); db_print_bio_flags(bp); db_printf(" cflags: 0x%hx\n", bp->bio_cflags); db_printf(" pflags: 0x%hx\n", bp->bio_pflags); db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset); db_printf(" length: %jd\n", (intmax_t)bp->bio_length); db_printf(" bcount: %ld\n", bp->bio_bcount); db_printf(" resid: %ld\n", bp->bio_resid); db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed); db_printf(" children: %u\n", bp->bio_children); db_printf(" inbed: %u\n", bp->bio_inbed); db_printf(" error: %d\n", bp->bio_error); db_printf(" parent: %p\n", bp->bio_parent); db_printf(" driver1: %p\n", bp->bio_driver1); db_printf(" driver2: %p\n", bp->bio_driver2); db_printf(" caller1: %p\n", bp->bio_caller1); db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) db_printf(" bio_track_bp: %p\n", bp->bio_track_bp); #endif } } #undef gprintf #undef gprintln #undef ADDFLAG #endif /* DDB */ Index: head/sys/geom/geom_vfs.c =================================================================== --- head/sys/geom/geom_vfs.c (revision 350693) +++ head/sys/geom/geom_vfs.c (revision 350694) @@ -1,290 +1,289 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include /* * subroutines for use by filesystems. * * XXX: should maybe live somewhere else ? */ #include struct g_vfs_softc { struct mtx sc_mtx; struct bufobj *sc_bo; int sc_active; int sc_orphaned; }; static struct buf_ops __g_vfs_bufops = { .bop_name = "GEOM_VFS", .bop_write = bufwrite, .bop_strategy = g_vfs_strategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush }; struct buf_ops *g_vfs_bufops = &__g_vfs_bufops; static g_orphan_t g_vfs_orphan; static struct g_class g_vfs_class = { .name = "VFS", .version = G_VERSION, .orphan = g_vfs_orphan, }; DECLARE_GEOM_CLASS(g_vfs_class, g_vfs); static void g_vfs_destroy(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); if (cp->geom->softc == NULL) g_wither_geom(cp->geom, ENXIO); } static void g_vfs_done(struct bio *bip) { struct g_consumer *cp; struct g_vfs_softc *sc; struct buf *bp; int destroy; struct mount *mp; struct vnode *vp; struct cdev *cdevp; /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ bp = bip->bio_caller2; vp = bp->b_vp; if (vp != NULL) { /* * If not a disk vnode, use its associated mount point * otherwise use the mountpoint associated with the disk. 
*/ VI_LOCK(vp); if (vp->v_type != VCHR || (cdevp = vp->v_rdev) == NULL || cdevp->si_devsw == NULL || (cdevp->si_devsw->d_flags & D_DISK) == 0) mp = vp->v_mount; else mp = cdevp->si_mountpt; if (mp != NULL) { if (bp->b_iocmd == BIO_READ) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncreads++; else mp->mnt_stat.f_syncreads++; } else if (bp->b_iocmd == BIO_WRITE) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncwrites++; else mp->mnt_stat.f_syncwrites++; } } VI_UNLOCK(vp); } cp = bip->bio_from; sc = cp->geom->softc; - if (bip->bio_error) { - printf("g_vfs_done():"); - g_print_bio(bip); - printf("error = %d\n", bip->bio_error); - } + if (bip->bio_error) + g_print_bio("g_vfs_done():", bip, "error = %d", + bip->bio_error); bp->b_error = bip->bio_error; bp->b_ioflags = bip->bio_flags; if (bip->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bip->bio_completed; g_destroy_bio(bip); mtx_lock(&sc->sc_mtx); destroy = ((--sc->sc_active) == 0 && sc->sc_orphaned); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_vfs_destroy, cp, M_WAITOK, NULL); bufdone(bp); } void g_vfs_strategy(struct bufobj *bo, struct buf *bp) { struct g_vfs_softc *sc; struct g_consumer *cp; struct bio *bip; cp = bo->bo_private; sc = cp->geom->softc; /* * If the provider has orphaned us, just return ENXIO. */ mtx_lock(&sc->sc_mtx); if (sc->sc_orphaned) { mtx_unlock(&sc->sc_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } sc->sc_active++; mtx_unlock(&sc->sc_mtx); bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bdata2bio(bp, bip); if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } bip->bio_done = g_vfs_done; bip->bio_caller2 = bp; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) buf_track(bp, __func__); bip->bio_track_bp = bp; #endif g_io_request(bip, cp); } static void g_vfs_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; int destroy; g_topology_assert(); gp = cp->geom; g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name); sc = gp->softc; if (sc == NULL) return; mtx_lock(&sc->sc_mtx); sc->sc_orphaned = 1; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_vfs_destroy(cp, 0); /* * Do not destroy the geom. Filesystem will do that during unmount. 
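The change to g_vfs_done() above uses the g_print_bio() calling convention introduced earlier in this commit: a prefix string, the bio, and a printf-style suffix, all emitted as a single line. Another class's completion path could use it the same way; a hypothetical sketch:

	static void
	g_example_done(struct bio *bp)
	{
		if (bp->bio_error != 0)
			g_print_bio("g_example_done():", bp, "error = %d",
			    bp->bio_error);
		g_std_done(bp);
	}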
*/ } int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; struct g_vfs_softc *sc; struct bufobj *bo; int error; g_topology_assert(); *cpp = NULL; bo = &vp->v_bufobj; if (bo->bo_private != vp) return (EBUSY); pp = g_dev_getprovider(vp->v_rdev); if (pp == NULL) return (ENOENT); gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_vfs", NULL, MTX_DEF); sc->sc_bo = bo; gp->softc = sc; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, wr, wr); if (error) { g_wither_geom(gp, ENXIO); return (error); } vnode_create_vobject(vp, pp->mediasize, curthread); *cpp = cp; cp->private = vp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; bo->bo_ops = g_vfs_bufops; bo->bo_private = cp; bo->bo_bsize = pp->sectorsize; return (error); } void g_vfs_close(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; g_topology_assert(); gp = cp->geom; sc = gp->softc; bufobj_invalbuf(sc->sc_bo, V_SAVE, 0, 0); sc->sc_bo->bo_private = cp->private; gp->softc = NULL; mtx_destroy(&sc->sc_mtx); if (!sc->sc_orphaned || cp->provider == NULL) g_wither_geom_close(gp, ENXIO); g_free(sc); } Index: head/sys/geom/journal/g_journal.c =================================================================== --- head/sys/geom/journal/g_journal.c (revision 350693) +++ head/sys/geom/journal/g_journal.c (revision 350694) @@ -1,3013 +1,3014 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GJ_MEMDEBUG #include #include #endif #include #include #include +#include #include FEATURE(geom_journal, "GEOM journaling support"); /* * On-disk journal format: * * JH - Journal header * RH - Record header * * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... 
* %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * */ CTASSERT(sizeof(struct g_journal_header) <= 512); CTASSERT(sizeof(struct g_journal_record_header) <= 512); static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); static struct mtx g_journal_cache_mtx; MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); const struct g_journal_desc *g_journal_filesystems[] = { &g_journal_ufs, NULL }; SYSCTL_DECL(_kern_geom); int g_journal_debug = 0; static u_int g_journal_switch_time = 10; static u_int g_journal_force_switch = 70; static u_int g_journal_parallel_flushes = 16; static u_int g_journal_parallel_copies = 16; static u_int g_journal_accept_immediately = 64; static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; static u_int g_journal_do_optimize = 1; static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff"); SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, &g_journal_switch_time, 0, "Switch journals every N seconds"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, &g_journal_force_switch, 0, "Force switch when journal is N% full"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, &g_journal_parallel_flushes, 0, "Number of flush I/O requests to send in parallel"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, &g_journal_accept_immediately, 0, "Number of I/O requests accepted immediately"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, &g_journal_parallel_copies, 0, "Number of copy I/O requests to send in parallel"); static int g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) { u_int entries; int error; entries = g_journal_record_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); g_journal_record_entries = entries; return (0); } SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I", "Maximum number of entires in one journal record"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); static u_long g_journal_cache_used = 0; static u_long g_journal_cache_limit = 64 * 1024 * 1024; static u_int g_journal_cache_divisor = 2; static u_int g_journal_cache_switch = 90; static u_int g_journal_cache_misses = 0; static u_int g_journal_cache_alloc_failures = 0; static u_long g_journal_cache_low = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0, "GEOM_JOURNAL cache"); SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, &g_journal_cache_used, 0, "Number of allocated bytes"); static int g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) { u_long limit; int error; limit = g_journal_cache_limit; error = sysctl_handle_long(oidp, &limit, 0, req); if (error != 0 || req->newptr == NULL) return (error); g_journal_cache_limit = limit; g_journal_cache_low = (limit / 100) * g_journal_cache_switch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I", "Maximum number of allocated bytes"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN, &g_journal_cache_divisor, 0, "(kmem_size / 
kern.geom.journal.cache.divisor) == cache size"); static int g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) { u_int cswitch; int error; cswitch = g_journal_cache_switch; error = sysctl_handle_int(oidp, &cswitch, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (cswitch > 100) return (EINVAL); g_journal_cache_switch = cswitch; g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I", "Force switch when we hit this percent of cache use"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW, &g_journal_cache_misses, 0, "Number of cache misses"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW, &g_journal_cache_alloc_failures, 0, "Memory allocation failures"); static u_long g_journal_stats_bytes_skipped = 0; static u_long g_journal_stats_combined_ios = 0; static u_long g_journal_stats_switches = 0; static u_long g_journal_stats_wait_for_copy = 0; static u_long g_journal_stats_journal_full = 0; static u_long g_journal_stats_low_mem = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0, "GEOM_JOURNAL statistics"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW, &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW, &g_journal_stats_combined_ios, 0, "Number of combined I/O requests"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW, &g_journal_stats_switches, 0, "Number of journal switches"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW, &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW, &g_journal_stats_journal_full, 0, "Number of times journal was almost full."); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW, &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called."); static g_taste_t g_journal_taste; static g_ctl_req_t g_journal_config; static g_dumpconf_t g_journal_dumpconf; static g_init_t g_journal_init; static g_fini_t g_journal_fini; struct g_class g_journal_class = { .name = G_JOURNAL_CLASS_NAME, .version = G_VERSION, .taste = g_journal_taste, .ctlreq = g_journal_config, .dumpconf = g_journal_dumpconf, .init = g_journal_init, .fini = g_journal_fini }; static int g_journal_destroy(struct g_journal_softc *sc); static void g_journal_metadata_update(struct g_journal_softc *sc); static void g_journal_start_switcher(struct g_class *mp); static void g_journal_stop_switcher(void); static void g_journal_switch_wait(struct g_journal_softc *sc); #define GJ_SWITCHER_WORKING 0 #define GJ_SWITCHER_DIE 1 #define GJ_SWITCHER_DIED 2 static struct proc *g_journal_switcher_proc = NULL; static int g_journal_switcher_state = GJ_SWITCHER_WORKING; static int g_journal_switcher_wokenup = 0; static int g_journal_sync_requested = 0; #ifdef GJ_MEMDEBUG struct meminfo { size_t mi_size; struct stack mi_stack; }; #endif /* * We use our own malloc/realloc/free funtions, so we can collect statistics * and force journal switch when we're running out of cache. 
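 *
 * A simplified sketch of what gj_malloc() below does (a paraphrase, not a
 * verbatim excerpt of the code):
 *
 *	mtx_lock(&g_journal_cache_mtx);
 *	if (g_journal_cache_used + size > g_journal_cache_low)
 *		wake up the switcher thread	(time to switch journals)
 *	if ((flags & M_NOWAIT) && g_journal_cache_used + size > limit)
 *		return (NULL)			(cache is full, refuse)
 *	g_journal_cache_used += size;
 *	mtx_unlock(&g_journal_cache_mtx);
 *	allocate the buffer with M_WAITOK and return it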
*/ static void * gj_malloc(size_t size, int flags) { void *p; #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif mtx_lock(&g_journal_cache_mtx); if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup && g_journal_cache_used + size > g_journal_cache_low) { GJ_DEBUG(1, "No cache, waking up the switcher."); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); } if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 && g_journal_cache_used + size > g_journal_cache_limit) { mtx_unlock(&g_journal_cache_mtx); g_journal_cache_alloc_failures++; return (NULL); } g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); flags &= ~M_NOWAIT; #ifndef GJ_MEMDEBUG p = malloc(size, M_JOURNAL, flags | M_WAITOK); #else mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK); p = (u_char *)mi + sizeof(*mi); mi->mi_size = size; stack_save(&mi->mi_stack); #endif return (p); } static void gj_free(void *p, size_t size) { #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif KASSERT(p != NULL, ("p=NULL")); KASSERT(size > 0, ("size=0")); mtx_lock(&g_journal_cache_mtx); KASSERT(g_journal_cache_used >= size, ("Freeing too much?")); g_journal_cache_used -= size; mtx_unlock(&g_journal_cache_mtx); #ifdef GJ_MEMDEBUG mi = p = (void *)((u_char *)p - sizeof(*mi)); if (mi->mi_size != size) { printf("GJOURNAL: Size mismatch! %zu != %zu\n", size, mi->mi_size); printf("GJOURNAL: Alloc backtrace:\n"); stack_print(&mi->mi_stack); printf("GJOURNAL: Free backtrace:\n"); kdb_backtrace(); } #endif free(p, M_JOURNAL); } static void * gj_realloc(void *p, size_t size, size_t oldsize) { void *np; #ifndef GJ_MEMDEBUG mtx_lock(&g_journal_cache_mtx); g_journal_cache_used -= oldsize; g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); np = realloc(p, size, M_JOURNAL, M_WAITOK); #else np = gj_malloc(size, M_WAITOK); bcopy(p, np, MIN(oldsize, size)); gj_free(p, oldsize); #endif return (np); } static void g_journal_check_overflow(struct g_journal_softc *sc) { off_t length, used; if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset) || (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset && sc->sc_journal_offset < sc->sc_active.jj_offset)) { panic("Journal overflow " "(id = %u joffset=%jd active=%jd inactive=%jd)", (unsigned)sc->sc_id, (intmax_t)sc->sc_journal_offset, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset); } if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) { length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset; used = sc->sc_journal_offset - sc->sc_active.jj_offset; } else { length = sc->sc_jend - sc->sc_active.jj_offset; length += sc->sc_inactive.jj_offset - sc->sc_jstart; if (sc->sc_journal_offset >= sc->sc_active.jj_offset) used = sc->sc_journal_offset - sc->sc_active.jj_offset; else { used = sc->sc_jend - sc->sc_active.jj_offset; used += sc->sc_journal_offset - sc->sc_jstart; } } /* Already woken up? */ if (g_journal_switcher_wokenup) return; /* * If the active journal takes more than g_journal_force_switch precent * of free journal space, we force journal switch. 
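 *
 * For example, with the default g_journal_force_switch of 70, a journal
 * area of length 1000MB with 750MB used gives (used * 100) / length == 75,
 * which is above 70, so the switcher thread is woken up.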
*/ KASSERT(length > 0, ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd", (intmax_t)length, (intmax_t)used, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset, (intmax_t)sc->sc_journal_offset)); if ((used * 100) / length > g_journal_force_switch) { g_journal_stats_journal_full++; GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.", sc->sc_name, (used * 100) / length); mtx_lock(&g_journal_cache_mtx); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); mtx_unlock(&g_journal_cache_mtx); } } static void g_journal_orphan(struct g_consumer *cp) { struct g_journal_softc *sc; char name[256]; int error; g_topology_assert(); sc = cp->geom->softc; strlcpy(name, cp->provider->name, sizeof(name)); GJ_DEBUG(0, "Lost provider %s.", name); if (sc == NULL) return; error = g_journal_destroy(sc); if (error == 0) GJ_DEBUG(0, "Journal %s destroyed.", name); else { GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). " "Destroy it manually after last close.", sc->sc_name, error); } } static int g_journal_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_journal_softc *sc; int dcr, dcw, dce; g_topology_assert(); GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } if (pp->acw == 0 && dcw > 0) { GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name); sc->sc_flags &= ~GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); sc->sc_flags |= GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } */ return (0); } static void g_journal_header_encode(struct g_journal_header *hdr, u_char *data) { bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC)); data += sizeof(GJ_HEADER_MAGIC); le32enc(data, hdr->jh_journal_id); data += 4; le32enc(data, hdr->jh_journal_next_id); } static int g_journal_header_decode(const u_char *data, struct g_journal_header *hdr) { bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic)); data += sizeof(hdr->jh_magic); if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0) return (EINVAL); hdr->jh_journal_id = le32dec(data); data += 4; hdr->jh_journal_next_id = le32dec(data); return (0); } static void g_journal_flush_cache(struct g_journal_softc *sc) { struct bintime bt; int error; if (sc->sc_bio_flush == 0) return; GJ_TIMER_START(1, &bt); if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) { error = g_io_flush(sc->sc_jconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_jconsumer->provider->name, error); } if (sc->sc_bio_flush & GJ_FLUSH_DATA) { /* * TODO: This could be called in parallel with the * previous call. */ error = g_io_flush(sc->sc_dconsumer); GJ_DEBUG(error == 0 ? 
2 : 0, "Flush cache of %s: error=%d.", sc->sc_dconsumer->provider->name, error); } GJ_TIMER_STOP(1, &bt, "Cache flush time"); } static int g_journal_write_header(struct g_journal_softc *sc) { struct g_journal_header hdr; struct g_consumer *cp; u_char *buf; int error; cp = sc->sc_jconsumer; buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic)); hdr.jh_journal_id = sc->sc_journal_id; hdr.jh_journal_next_id = sc->sc_journal_next_id; g_journal_header_encode(&hdr, buf); error = g_write_data(cp, sc->sc_journal_offset, buf, cp->provider->sectorsize); /* if (error == 0) */ sc->sc_journal_offset += cp->provider->sectorsize; gj_free(buf, cp->provider->sectorsize); return (error); } /* * Every journal record has a header and data following it. * Functions below are used to decode the header before storing it to * little endian and to encode it after reading to system endianness. */ static void g_journal_record_header_encode(struct g_journal_record_header *hdr, u_char *data) { struct g_journal_entry *ent; u_int i; bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC)); data += sizeof(GJ_RECORD_HEADER_MAGIC); le32enc(data, hdr->jrh_journal_id); data += 8; le16enc(data, hdr->jrh_nentries); data += 2; bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; le64enc(data, ent->je_joffset); data += 8; le64enc(data, ent->je_offset); data += 8; le64enc(data, ent->je_length); data += 8; } } static int g_journal_record_header_decode(const u_char *data, struct g_journal_record_header *hdr) { struct g_journal_entry *ent; u_int i; bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic)); data += sizeof(hdr->jrh_magic); if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0) return (EINVAL); hdr->jrh_journal_id = le32dec(data); data += 8; hdr->jrh_nentries = le16dec(data); data += 2; if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; ent->je_joffset = le64dec(data); data += 8; ent->je_offset = le64dec(data); data += 8; ent->je_length = le64dec(data); data += 8; } return (0); } /* * Function reads metadata from a provider (via the given consumer), decodes * it to system endianness and verifies its correctness. */ static int g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata is stored in last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = journal_metadata_decode(buf, md); g_free(buf); /* Is this is gjournal provider at all? */ if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0) return (EINVAL); /* * Are we able to handle this version of metadata? * We only maintain backward compatibility. */ if (md->md_version > G_JOURNAL_VERSION) { GJ_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } /* Is checksum correct? 
*/ if (error != 0) { GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } /* * Two functions below are responsible for updating metadata. * Only metadata on the data provider is updated (we need to update * information about active journal in there). */ static void g_journal_metadata_done(struct bio *bp) { /* * There is not much we can do on error except informing about it. */ if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).", bp->bio_error); } else { GJ_LOGREQ(2, bp, "Metadata updated."); } gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(bp); } static void g_journal_metadata_update(struct g_journal_softc *sc) { struct g_journal_metadata md; struct g_consumer *cp; struct bio *bp; u_char *sector; cp = sc->sc_dconsumer; sector = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic)); md.md_version = G_JOURNAL_VERSION; md.md_id = sc->sc_id; md.md_type = sc->sc_orig_type; md.md_jstart = sc->sc_jstart; md.md_jend = sc->sc_jend; md.md_joffset = sc->sc_inactive.jj_offset; md.md_jid = sc->sc_journal_previous_id; md.md_flags = 0; if (sc->sc_flags & GJF_DEVICE_CLEAN) md.md_flags |= GJ_FLAG_CLEAN; if (sc->sc_flags & GJF_DEVICE_HARDCODED) strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = cp->provider->mediasize; journal_metadata_encode(&md, sector); /* * Flush the cache, so we know all data are on disk. * We write here informations like "journal is consistent", so we need * to be sure it is. Without BIO_FLUSH here, we can end up in situation * where metadata is stored on disk, but not all data. */ g_journal_flush_cache(sc); bp = g_alloc_bio(); bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize; bp->bio_length = cp->provider->sectorsize; bp->bio_data = sector; bp->bio_cmd = BIO_WRITE; if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) { bp->bio_done = g_journal_metadata_done; g_io_request(bp, cp); } else { bp->bio_done = NULL; g_io_request(bp, cp); biowait(bp, "gjmdu"); g_journal_metadata_done(bp); } /* * Be sure metadata reached the disk. */ g_journal_flush_cache(sc); } /* * This is where the I/O request comes from the GEOM. 
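 *
 * In short: BIO_READ and BIO_WRITE are queued on sc_regular_queue for the
 * worker thread, the "GJOURNAL::provider" BIO_GETATTR is answered in place,
 * and everything else (including BIO_DELETE) gets EOPNOTSUPP.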
*/ static void g_journal_start(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_to->geom->softc; GJ_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_regular_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); return; case BIO_GETATTR: if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); bp->bio_completed = strlen(bp->bio_to->name) + 1; g_io_deliver(bp, 0); return; } /* FALLTHROUGH */ case BIO_DELETE: default: g_io_deliver(bp, EOPNOTSUPP); return; } } static void g_journal_std_done(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_back_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); } static struct bio * g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, int flags) { struct bio *bp; bp = g_alloc_bio(); bp->bio_offset = start; bp->bio_joffset = joffset; bp->bio_length = end - start; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; if (data == NULL) bp->bio_data = NULL; else { bp->bio_data = gj_malloc(bp->bio_length, flags); if (bp->bio_data != NULL) bcopy(data, bp->bio_data, bp->bio_length); } return (bp); } #define g_journal_insert_bio(head, bp, flags) \ g_journal_insert((head), (bp)->bio_offset, \ (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ (bp)->bio_data, flags) /* * The function below does a lot more than just inserting bio to the queue. * It keeps the queue sorted by offset and ensures that there are no doubled * data (it combines bios where ranges overlap). * * The function returns the number of bios inserted (as bio can be splitted). */ static int g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset, u_char *data, int flags) { struct bio *nbp, *cbp, *pbp; off_t cstart, cend; u_char *tmpdata; int n; GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend, joffset); n = 0; pbp = NULL; GJQ_FOREACH(*head, cbp) { cstart = cbp->bio_offset; cend = cbp->bio_offset + cbp->bio_length; if (nstart >= cend) { /* * +-------------+ * | | * | current | +-------------+ * | bio | | | * | | | new | * +-------------+ | bio | * | | * +-------------+ */ GJ_DEBUG(3, "INSERT(%p): 1", *head); } else if (nend <= cstart) { /* * +-------------+ * | | * +-------------+ | current | * | | | bio | * | new | | | * | bio | +-------------+ * | | * +-------------+ */ nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; n++; GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp, pbp); goto end; } else if (nstart <= cstart && nend >= cend) { /* * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+---+ +-------------+---+ * | | | | | | | * | | | | | | | * | +-------------+ | +-------------+ | * | new bio | | new bio | * +---------------------+ +-----------------+ * * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+ +-------------+ * | | | | | * | | | | | * | +-------------+ +-------------+ * | new bio | | new bio | * +-----------------+ +-------------+ */ g_journal_stats_bytes_skipped += cbp->bio_length; cbp->bio_offset = nstart; cbp->bio_joffset = joffset; cbp->bio_length = cend - nstart; if (cbp->bio_data != NULL) { gj_free(cbp->bio_data, cend - cstart); cbp->bio_data = NULL; } if (data != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if 
(cbp->bio_data != NULL) { bcopy(data, cbp->bio_data, cbp->bio_length); } data += cend - nstart; } joffset += cend - nstart; nstart = cend; GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend >= cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * | +-------------+ | +---------+---+ * | | | | | | | * | | | | | | | * +---+-------------+ +---+---------+ | * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += cend - nstart; nbp = g_journal_new_bio(nstart, cend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } if (data != NULL) data += cend - nstart; joffset += cend - nstart; nstart = cend; n++; GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend < cend) { /* * +---------------------+ * | current bio | * | +-------------+ | * | | | | * | | | | * +---+-------------+---+ * | new bio | * +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; if (cbp->bio_data == NULL) tmpdata = NULL; else tmpdata = cbp->bio_data + nend - cstart; nbp = g_journal_new_bio(nend, cend, cbp->bio_joffset + nend - cstart, tmpdata, flags); nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next; ((struct bio *)cbp->bio_next)->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } n += 2; GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp); goto end; } else if (nstart <= cstart && nend < cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * +-------------+ | +---+---------+ | * | | | | | | | * | | | | | | | * +-------------+---+ | +---------+---+ * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; cbp->bio_offset = nend; cbp->bio_length = cend - nend; cbp->bio_joffset += nend - cstart; tmpdata = cbp->bio_data; if (tmpdata != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(tmpdata + nend - cstart, cbp->bio_data, cbp->bio_length); } gj_free(tmpdata, cend - cstart); } n++; GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp); goto end; } if (nstart == nend) goto end; pbp = cbp; } nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = NULL; n++; GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp); end: if (g_journal_debug >= 3) { GJQ_FOREACH(*head, cbp) { GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp, (intmax_t)cbp->bio_offset, (intmax_t)cbp->bio_length, (intmax_t)cbp->bio_joffset, cbp->bio_data); } GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n); } return (n); } /* * The function combines neighbour bios trying to squeeze as much data as * possible into one bio. * * The function returns the number of bios combined (negative value). */ static int g_journal_optimize(struct bio *head) { struct bio *cbp, *pbp; int n; n = 0; pbp = NULL; GJQ_FOREACH(head, cbp) { /* Skip bios which has to be read first. 
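 * (A NULL bio_data below means the payload is no longer cached in memory
 *  and would have to be read back from the journal before it could be
 *  combined, so such bios are skipped.)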
*/ if (cbp->bio_data == NULL) { pbp = NULL; continue; } /* There is no previous bio yet. */ if (pbp == NULL) { pbp = cbp; continue; } /* Is this a neighbour bio? */ if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) { /* Be sure that bios queue is sorted. */ KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset, ("poffset=%jd plength=%jd coffset=%jd", (intmax_t)pbp->bio_offset, (intmax_t)pbp->bio_length, (intmax_t)cbp->bio_offset)); pbp = cbp; continue; } /* Be sure we don't end up with too big bio. */ if (pbp->bio_length + cbp->bio_length > MAXPHYS) { pbp = cbp; continue; } /* Ok, we can join bios. */ GJ_LOGREQ(4, pbp, "Join: "); GJ_LOGREQ(4, cbp, "and: "); pbp->bio_data = gj_realloc(pbp->bio_data, pbp->bio_length + cbp->bio_length, pbp->bio_length); bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length, cbp->bio_length); gj_free(cbp->bio_data, cbp->bio_length); pbp->bio_length += cbp->bio_length; pbp->bio_next = cbp->bio_next; g_destroy_bio(cbp); cbp = pbp; g_journal_stats_combined_ios++; n--; GJ_LOGREQ(4, pbp, "Got: "); } return (n); } /* * TODO: Update comment. * These are functions responsible for copying one portion of data from journal * to the destination provider. * The order goes like this: * 1. Read the header, which contains informations about data blocks * following it. * 2. Read the data blocks from the journal. * 3. Write the data blocks on the data provider. * * g_journal_copy_start() * g_journal_copy_done() - got finished write request, logs potential errors. */ /* * When there is no data in cache, this function is used to read it. */ static void g_journal_read_first(struct g_journal_softc *sc, struct bio *bp) { struct bio *cbp; /* * We were short in memory, so data was freed. * In that case we need to read it back from journal. */ cbp = g_alloc_bio(); cbp->bio_cflags = bp->bio_cflags; cbp->bio_parent = bp; cbp->bio_offset = bp->bio_joffset; cbp->bio_length = bp->bio_length; cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK); cbp->bio_cmd = BIO_READ; cbp->bio_done = g_journal_std_done; GJ_LOGREQ(4, cbp, "READ FIRST"); g_io_request(cbp, sc->sc_jconsumer); g_journal_cache_misses++; } static void g_journal_copy_send(struct g_journal_softc *sc) { struct bio *bioq, *bp, *lbp; bioq = lbp = NULL; mtx_lock(&sc->sc_mtx); for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) { bp = GJQ_FIRST(sc->sc_inactive.jj_queue); if (bp == NULL) break; GJQ_REMOVE(sc->sc_inactive.jj_queue, bp); sc->sc_copy_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } mtx_unlock(&sc->sc_mtx); if (g_journal_do_optimize) sc->sc_copy_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJQ_INSERT_HEAD(sc->sc_copy_queue, bp); bp->bio_cflags = GJ_BIO_COPY; if (bp->bio_data == NULL) g_journal_read_first(sc, bp); else { bp->bio_joffset = 0; GJ_LOGREQ(4, bp, "SEND"); g_io_request(bp, sc->sc_dconsumer); } } } static void g_journal_copy_start(struct g_journal_softc *sc) { /* * Remember in metadata that we're starting to copy journaled data * to the data provider. * In case of power failure, we will copy these data once again on boot. */ if (!sc->sc_journal_copying) { sc->sc_journal_copying = 1; GJ_DEBUG(1, "Starting copy of journal."); g_journal_metadata_update(sc); } g_journal_copy_send(sc); } /* * Data block has been read from the journal provider. 
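 *
 * On success the freshly read buffer is handed over to the parent WRITE
 * bio, which is then sent to the data consumer; on error both bios are
 * destroyed and the copy-in-progress counter is decremented.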
*/ static int g_journal_copy_read_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; pbp = bp->bio_parent; if (bp->bio_error != 0) { GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); /* * We will not be able to deliver WRITE request as well. */ gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(pbp); g_destroy_bio(bp); sc->sc_copy_in_progress--; return (1); } pbp->bio_data = bp->bio_data; cp = sc->sc_dconsumer; g_io_request(pbp, cp); GJ_LOGREQ(4, bp, "READ DONE"); g_destroy_bio(bp); return (0); } /* * Data block has been written to the data provider. */ static void g_journal_copy_write_done(struct bio *bp) { struct g_journal_softc *sc; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; sc->sc_copy_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", bp->bio_error); } GJQ_REMOVE(sc->sc_copy_queue, bp); gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); if (sc->sc_copy_in_progress == 0) { /* * This was the last write request for this journal. */ GJ_DEBUG(1, "Data has been copied."); sc->sc_journal_copying = 0; } } static void g_journal_flush_done(struct bio *bp); /* * Flush one record onto active journal provider. */ static void g_journal_flush(struct g_journal_softc *sc) { struct g_journal_record_header hdr; struct g_journal_entry *ent; struct g_provider *pp; struct bio **bioq; struct bio *bp, *fbp, *pbp; off_t joffset; u_char *data, hash[16]; MD5_CTX ctx; u_int i; if (sc->sc_current_count == 0) return; pp = sc->sc_jprovider; GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); joffset = sc->sc_journal_offset; GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.", sc->sc_current_count, pp->name, (intmax_t)joffset); /* * Store 'journal id', so we know to which journal this record belongs. */ hdr.jrh_journal_id = sc->sc_journal_id; /* Could be less than g_journal_record_entries if called due timeout. */ hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries); strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic)); bioq = &sc->sc_active.jj_queue; GJQ_LAST(sc->sc_flush_queue, pbp); fbp = g_alloc_bio(); fbp->bio_parent = NULL; fbp->bio_cflags = GJ_BIO_JOURNAL; fbp->bio_offset = -1; fbp->bio_joffset = joffset; fbp->bio_length = pp->sectorsize; fbp->bio_cmd = BIO_WRITE; fbp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp); pbp = fbp; fbp->bio_to = pp; GJ_LOGREQ(4, fbp, "FLUSH_OUT"); joffset += pp->sectorsize; sc->sc_flush_count++; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < hdr.jrh_nentries; i++) { bp = sc->sc_current_queue; KASSERT(bp != NULL, ("NULL bp")); bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSHED"); sc->sc_current_queue = bp->bio_next; bp->bio_next = NULL; sc->sc_current_count--; /* Add to the header. 
*/ ent = &hdr.jrh_entries[i]; ent->je_offset = bp->bio_offset; ent->je_joffset = joffset; ent->je_length = bp->bio_length; data = bp->bio_data; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Update(&ctx, data, ent->je_length); g_reset_bio(bp); bp->bio_cflags = GJ_BIO_JOURNAL; bp->bio_offset = ent->je_offset; bp->bio_joffset = ent->je_joffset; bp->bio_length = ent->je_length; bp->bio_data = data; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp); pbp = bp; bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSH_OUT"); joffset += bp->bio_length; sc->sc_flush_count++; /* * Add request to the active sc_journal_queue queue. * This is our cache. After journal switch we don't have to * read the data from the inactive journal, because we keep * it in memory. */ g_journal_insert(bioq, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, data, M_NOWAIT); } /* * After all requests, store valid header. */ data = gj_malloc(pp->sectorsize, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(hash, &ctx); bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum)); } g_journal_record_header_encode(&hdr, data); fbp->bio_data = data; sc->sc_journal_offset = joffset; g_journal_check_overflow(sc); } /* * Flush request finished. */ static void g_journal_flush_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); cp = bp->bio_from; sc = cp->geom->softc; sc->sc_flush_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", bp->bio_error); } gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); } static void g_journal_release_delayed(struct g_journal_softc *sc); static void g_journal_flush_send(struct g_journal_softc *sc) { struct g_consumer *cp; struct bio *bioq, *bp, *lbp; cp = sc->sc_jconsumer; bioq = lbp = NULL; while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { /* Send one flush requests to the active journal. */ bp = GJQ_FIRST(sc->sc_flush_queue); if (bp != NULL) { GJQ_REMOVE(sc->sc_flush_queue, bp); sc->sc_flush_count--; bp->bio_offset = bp->bio_joffset; bp->bio_joffset = 0; sc->sc_flush_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } /* Try to release delayed requests. */ g_journal_release_delayed(sc); /* If there are no requests to flush, leave. */ if (GJQ_FIRST(sc->sc_flush_queue) == NULL) break; } if (g_journal_do_optimize) sc->sc_flush_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJ_LOGREQ(3, bp, "Flush request send"); g_io_request(bp, cp); } } static void g_journal_add_current(struct g_journal_softc *sc, struct bio *bp) { int n; GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); sc->sc_current_count += n; n = g_journal_optimize(sc->sc_current_queue); sc->sc_current_count += n; /* * For requests which are added to the current queue we deliver * response immediately. */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); if (sc->sc_current_count >= g_journal_record_entries) { /* * Let's flush one record onto active journal provider. */ g_journal_flush(sc); } } static void g_journal_release_delayed(struct g_journal_softc *sc) { struct bio *bp; for (;;) { /* The flush queue is full, exit. 
*/ if (sc->sc_flush_count >= g_journal_accept_immediately) return; bp = bioq_takefirst(&sc->sc_delayed_queue); if (bp == NULL) return; sc->sc_delayed_count--; g_journal_add_current(sc, bp); } } /* * Add I/O request to the current queue. If we have enough requests for one * journal record we flush them onto active journal provider. */ static void g_journal_add_request(struct g_journal_softc *sc, struct bio *bp) { /* * The flush queue is full, we need to delay the request. */ if (sc->sc_delayed_count > 0 || sc->sc_flush_count >= g_journal_accept_immediately) { GJ_LOGREQ(4, bp, "DELAYED"); bioq_insert_tail(&sc->sc_delayed_queue, bp); sc->sc_delayed_count++; return; } KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), ("DELAYED queue not empty.")); g_journal_add_current(sc, bp); } static void g_journal_read_done(struct bio *bp); /* * Try to find requested data in cache. */ static struct bio * g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; GJQ_FOREACH(head, bp) { if (bp->bio_offset == -1) continue; cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) { if (!sorted) continue; else { bp = NULL; break; } } if (bp->bio_data == NULL) break; GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. */ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * This function is used for collecting data on read. * The complexity is because parts of the data can be stored in four different * places: * - in memory - the data not yet send to the active journal provider * - in the active journal * - in the inactive journal * - in the data provider */ static void g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, off_t oend) { struct bio *bp, *nbp, *head; off_t cstart, cend; u_int i, sorted = 0; GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); cstart = cend = -1; bp = NULL; head = NULL; for (i = 1; i <= 5; i++) { switch (i) { case 1: /* Not-yet-send data. */ head = sc->sc_current_queue; sorted = 1; break; case 2: /* Skip flush queue as they are also in active queue */ continue; case 3: /* Active journal. */ head = sc->sc_active.jj_queue; sorted = 1; break; case 4: /* Inactive journal. */ /* * XXX: Here could be a race with g_journal_lowmem(). */ head = sc->sc_inactive.jj_queue; sorted = 1; break; case 5: /* In-flight to the data provider. */ head = sc->sc_copy_queue; sorted = 0; break; default: panic("gjournal %s: i=%d", __func__, i); } bp = g_journal_read_find(head, sorted, pbp, ostart, oend); if (bp == pbp) { /* Got the whole request. 
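 * (g_journal_read_find() has already copied every requested byte into pbp
 *  and delivered it with g_io_deliver(), so nothing is left to do here.)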
*/ GJ_DEBUG(2, "Got the whole request from %u.", i); return; } else if (bp != NULL) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", i, (intmax_t)cstart, (intmax_t)cend); break; } } if (bp != NULL) { if (bp->bio_data == NULL) { nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + cstart - pbp->bio_offset; nbp->bio_offset = bp->bio_joffset + cstart - bp->bio_offset; nbp->bio_length = cend - cstart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_jconsumer); } /* * If we don't have the whole request yet, call g_journal_read() * recursively. */ if (ostart < cstart) g_journal_read(sc, pbp, ostart, cstart); if (oend > cend) g_journal_read(sc, pbp, cend, oend); } else { /* * No data in memory, no data in journal. * Its time for asking data provider. */ GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; nbp->bio_offset = ostart; nbp->bio_length = oend - ostart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_dconsumer); /* We have the whole request, return here. */ return; } } /* * Function responsible for handling finished READ requests. * Actually, g_std_done() could be used here, the only difference is that we * log error. */ static void g_journal_read_done(struct bio *bp) { struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_READ, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); pbp = bp->bio_parent; pbp->bio_inbed++; pbp->bio_completed += bp->bio_length; if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); } g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed && pbp->bio_completed == pbp->bio_length) { /* We're done. */ g_io_deliver(pbp, 0); } } /* * Deactive current journal and active next one. */ static void g_journal_switch(struct g_journal_softc *sc) { struct g_provider *pp; if (JEMPTY(sc)) { GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); pp = LIST_FIRST(&sc->sc_geom->provider); if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { sc->sc_flags |= GJF_DEVICE_CLEAN; GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); } } else { GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); pp = sc->sc_jprovider; sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_id = sc->sc_journal_next_id; sc->sc_journal_next_id = arc4random(); GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); g_journal_write_header(sc); sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; sc->sc_active.jj_offset = sc->sc_journal_offset - pp->sectorsize; sc->sc_active.jj_queue = NULL; /* * Switch is done, start copying data from the (now) inactive * journal to the data provider. 
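 * The journal IDs were rotated and a fresh header written above, so new
 * records now go to the new active journal while g_journal_copy_start()
 * drains the old, now inactive, one in the background.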
*/ g_journal_copy_start(sc); } mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_SWITCH; mtx_unlock(&sc->sc_mtx); } static void g_journal_initialize(struct g_journal_softc *sc) { sc->sc_journal_id = arc4random(); sc->sc_journal_next_id = arc4random(); sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_offset = sc->sc_jstart; sc->sc_inactive.jj_offset = sc->sc_jstart; g_journal_write_header(sc); sc->sc_active.jj_offset = sc->sc_jstart; } static void g_journal_mark_as_dirty(struct g_journal_softc *sc) { const struct g_journal_desc *desc; int i; GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name); for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++) desc->jd_dirty(sc->sc_dconsumer); } /* * Function read record header from the given journal. * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio * and data on every call. */ static int g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset, void *data) { int error; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = cp->provider->sectorsize; bp->bio_data = data; g_io_request(bp, cp); error = biowait(bp, "gjs_read"); return (error); } #if 0 /* * Function is called when we start the journal device and we detect that * one of the journals was not fully copied. * The purpose of this function is to read all records headers from journal * and placed them in the inactive queue, so we can start journal * synchronization process and the journal provider itself. * Design decision was taken to not synchronize the whole journal here as it * can take too much time. Reading headers only and delaying synchronization * process until after journal provider is started should be the best choice. */ #endif static void g_journal_sync(struct g_journal_softc *sc) { struct g_journal_record_header rhdr; struct g_journal_entry *ent; struct g_journal_header jhdr; struct g_consumer *cp; struct bio *bp, *fbp, *tbp; off_t joffset, offset; u_char *buf, sum[16]; uint64_t id; MD5_CTX ctx; int error, found, i; found = 0; fbp = NULL; cp = sc->sc_jconsumer; bp = g_alloc_bio(); buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset; GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset); /* * Read and decode first journal header. */ error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { GJ_DEBUG(0, "Error while reading journal header from %s.", cp->provider->name); goto end; } error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(0, "Cannot decode journal header from %s.", cp->provider->name); goto end; } id = sc->sc_journal_id; if (jhdr.jh_journal_id != sc->sc_journal_id) { GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); goto end; } offset += cp->provider->sectorsize; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; for (;;) { /* * If the biggest record won't fit, look for a record header or * journal header from the beginning. */ GJ_VALIDATE_OFFSET(offset, sc); error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { /* * Not good. Having an error while reading header * means, that we cannot read next headers and in * consequence we cannot find termination. 
*/ GJ_DEBUG(0, "Error while reading record header from %s.", cp->provider->name); break; } error = g_journal_record_header_decode(buf, &rhdr); if (error != 0) { GJ_DEBUG(2, "Not a record header at %jd (error=%d).", (intmax_t)offset, error); /* * This is not a record header. * If we are lucky, this is next journal header. */ error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(1, "Not a journal header at %jd (error=%d).", (intmax_t)offset, error); /* * Nope, this is not journal header, which * bascially means that journal is not * terminated properly. */ error = ENOENT; break; } /* * Ok. This is header of _some_ journal. Now we need to * verify if this is header of the _next_ journal. */ if (jhdr.jh_journal_id != id) { GJ_DEBUG(1, "Journal ID mismatch at %jd " "(0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); error = ENOENT; break; } /* Found termination. */ found++; GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).", (intmax_t)offset, (u_int)id); sc->sc_active.jj_offset = offset; sc->sc_journal_offset = offset + cp->provider->sectorsize; sc->sc_journal_id = id; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; GJ_LOGREQ(3, tbp, "Adding request."); g_journal_insert_bio(&sc->sc_inactive.jj_queue, tbp, M_WAITOK); } /* Skip journal's header. */ offset += cp->provider->sectorsize; continue; } /* Skip record's header. */ offset += cp->provider->sectorsize; /* * Add information about every record entry to the inactive * queue. */ if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < rhdr.jrh_nentries; i++) { ent = &rhdr.jrh_entries[i]; GJ_DEBUG(3, "Insert entry: %jd %jd.", (intmax_t)ent->je_offset, (intmax_t)ent->je_length); g_journal_insert(&fbp, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, NULL, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { u_char *buf2; /* * TODO: Should use faster function (like * g_journal_sync_read()). */ buf2 = g_read_data(cp, offset, ent->je_length, NULL); if (buf2 == NULL) GJ_DEBUG(0, "Cannot read data at %jd.", (intmax_t)offset); else { MD5Update(&ctx, buf2, ent->je_length); g_free(buf2); } } /* Skip entry's data. */ offset += ent->je_length; } if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(sum, &ctx); if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) { GJ_DEBUG(0, "MD5 hash mismatch at %jd!", (intmax_t)offset); } } } end: gj_free(bp->bio_data, cp->provider->sectorsize); g_destroy_bio(bp); /* Remove bios from unterminated journal. */ while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; g_destroy_bio(tbp); } if (found < 1 && joffset > 0) { GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.", sc->sc_name); while ((tbp = sc->sc_inactive.jj_queue) != NULL) { sc->sc_inactive.jj_queue = tbp->bio_next; g_destroy_bio(tbp); } g_journal_initialize(sc); g_journal_mark_as_dirty(sc); } else { GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name); g_journal_copy_start(sc); } } /* * Wait for requests. * If we have requests in the current queue, flush them after 3 seconds from the * last flush. In this way we don't wait forever (or for journal switch) with * storing not full records on journal. */ static void g_journal_wait(struct g_journal_softc *sc, time_t last_write) { int error, timeout; GJ_DEBUG(3, "%s: enter", __func__); if (sc->sc_current_count == 0) { if (g_journal_debug < 2) msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0); else { /* * If we have debug turned on, show number of elements * in various queues. 
*/ for (;;) { error = msleep(sc, &sc->sc_mtx, PRIBIO, "gj:work", hz * 3); if (error == 0) { mtx_unlock(&sc->sc_mtx); break; } GJ_DEBUG(3, "Report: current count=%d", sc->sc_current_count); GJ_DEBUG(3, "Report: flush count=%d", sc->sc_flush_count); GJ_DEBUG(3, "Report: flush in progress=%d", sc->sc_flush_in_progress); GJ_DEBUG(3, "Report: copy in progress=%d", sc->sc_copy_in_progress); GJ_DEBUG(3, "Report: delayed=%d", sc->sc_delayed_count); } } GJ_DEBUG(3, "%s: exit 1", __func__); return; } /* * Flush even not full records every 3 seconds. */ timeout = (last_write + 3 - time_second) * hz; if (timeout <= 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 2", __func__); return; } error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout); if (error == EWOULDBLOCK) g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 3", __func__); } /* * Worker thread. */ static void g_journal_worker(void *arg) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *bp; time_t last_write; int type; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sc = arg; type = 0; /* gcc */ if (sc->sc_flags & GJF_DEVICE_CLEAN) { GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); g_journal_initialize(sc); } else { g_journal_sync(sc); } /* * Check if we can use BIO_FLUSH. */ sc->sc_bio_flush = 0; if (g_io_flush(sc->sc_jconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_jconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_jconsumer->provider->name); } if (sc->sc_jconsumer != sc->sc_dconsumer) { if (g_io_flush(sc->sc_dconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_DATA; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_dconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_dconsumer->provider->name); } } gp = sc->sc_geom; g_topology_lock(); pp = g_new_providerf(gp, "%s.journal", sc->sc_name); pp->mediasize = sc->sc_mediasize; /* * There could be a problem when data provider and journal providers * have different sectorsize, but such scenario is prevented on journal * creation. */ pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); g_topology_unlock(); last_write = time_second; if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } for (;;) { /* Get first request from the queue. */ mtx_lock(&sc->sc_mtx); bp = bioq_first(&sc->sc_back_queue); if (bp != NULL) type = (bp->bio_cflags & GJ_BIO_MASK); if (bp == NULL) { bp = bioq_first(&sc->sc_regular_queue); if (bp != NULL) type = GJ_BIO_REGULAR; } if (bp == NULL) { try_switch: if ((sc->sc_flags & GJF_DEVICE_SWITCH) || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (sc->sc_current_count > 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); continue; } if (sc->sc_flush_in_progress > 0) goto sleep; if (sc->sc_copy_in_progress > 0) goto sleep; } if (sc->sc_flags & GJF_DEVICE_SWITCH) { mtx_unlock(&sc->sc_mtx); g_journal_switch(sc); wakeup(&sc->sc_journal_copying); continue; } if (sc->sc_flags & GJF_DEVICE_DESTROY) { GJ_DEBUG(1, "Shutting down worker " "thread for %s.", gp->name); sc->sc_worker = NULL; wakeup(&sc->sc_worker); mtx_unlock(&sc->sc_mtx); kproc_exit(0); } sleep: g_journal_wait(sc, last_write); continue; } /* * If we're in switch process, we need to delay all new * write requests until its done. 
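 * (The WRITE below stays on sc_regular_queue and the worker jumps back to
 *  the switch logic; reads and internal journal bios are still handled.)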
*/ if ((sc->sc_flags & GJF_DEVICE_SWITCH) && type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { GJ_LOGREQ(2, bp, "WRITE on SWITCH"); goto try_switch; } if (type == GJ_BIO_REGULAR) bioq_remove(&sc->sc_regular_queue, bp); else bioq_remove(&sc->sc_back_queue, bp); mtx_unlock(&sc->sc_mtx); switch (type) { case GJ_BIO_REGULAR: /* Regular request. */ switch (bp->bio_cmd) { case BIO_READ: g_journal_read(sc, bp, bp->bio_offset, bp->bio_offset + bp->bio_length); break; case BIO_WRITE: last_write = time_second; g_journal_add_request(sc, bp); g_journal_flush_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_COPY: switch (bp->bio_cmd) { case BIO_READ: if (g_journal_copy_read_done(bp)) g_journal_copy_send(sc); break; case BIO_WRITE: g_journal_copy_write_done(bp); g_journal_copy_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_JOURNAL: g_journal_flush_done(bp); g_journal_flush_send(sc); break; case GJ_BIO_READ: default: panic("Invalid bio (%d).", type); } } } static void g_journal_destroy_event(void *arg, int flags __unused) { struct g_journal_softc *sc; g_topology_assert(); sc = arg; g_journal_destroy(sc); } static void g_journal_timeout(void *arg) { struct g_journal_softc *sc; sc = arg; GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", sc->sc_geom->name); g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); } static struct g_geom * g_journal_create(struct g_class *mp, struct g_provider *pp, const struct g_journal_metadata *md) { struct g_journal_softc *sc; struct g_geom *gp; struct g_consumer *cp; int error; sc = NULL; /* gcc */ g_topology_assert(); /* * There are two possibilities: * 1. Data and both journals are on the same provider. * 2. Data and journals are all on separated providers. */ /* Look for journal device with the same ID. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_id == md->md_id) break; } if (gp == NULL) sc = NULL; else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); return (NULL); } if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { GJ_DEBUG(0, "Invalid type on %s.", pp->name); return (NULL); } if (md->md_type & GJ_TYPE_DATA) { GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, pp->name); } if (md->md_type & GJ_TYPE_JOURNAL) { GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, pp->name); } if (sc == NULL) { /* Action geom. */ sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); sc->sc_id = md->md_id; sc->sc_type = 0; sc->sc_flags = 0; sc->sc_worker = NULL; gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); gp->start = g_journal_start; gp->orphan = g_journal_orphan; gp->access = g_journal_access; gp->softc = sc; gp->flags |= G_GEOM_VOLATILE_BIO; sc->sc_geom = gp; mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); bioq_init(&sc->sc_back_queue); bioq_init(&sc->sc_regular_queue); bioq_init(&sc->sc_delayed_queue); sc->sc_delayed_count = 0; sc->sc_current_queue = NULL; sc->sc_current_count = 0; sc->sc_flush_queue = NULL; sc->sc_flush_count = 0; sc->sc_flush_in_progress = 0; sc->sc_copy_queue = NULL; sc->sc_copy_in_progress = 0; sc->sc_inactive.jj_queue = NULL; sc->sc_active.jj_queue = NULL; sc->sc_rootmount = root_mount_hold("GJOURNAL"); GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); callout_init(&sc->sc_callout, 1); if (md->md_type != GJ_TYPE_COMPLETE) { /* * Journal and data are on separate providers. 
* At this point we have only one of them. * We setup a timeout in case the other part will not * appear, so we won't wait forever. */ callout_reset(&sc->sc_callout, 5 * hz, g_journal_timeout, sc); } } /* Remember type of the data provider. */ if (md->md_type & GJ_TYPE_DATA) sc->sc_orig_type = md->md_type; sc->sc_type |= md->md_type; cp = NULL; if (md->md_type & GJ_TYPE_DATA) { if (md->md_flags & GJ_FLAG_CLEAN) sc->sc_flags |= GJF_DEVICE_CLEAN; if (md->md_flags & GJ_FLAG_CHECKSUM) sc->sc_flags |= GJF_DEVICE_CHECKSUM; cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } sc->sc_dconsumer = cp; sc->sc_mediasize = pp->mediasize - pp->sectorsize; sc->sc_sectorsize = pp->sectorsize; sc->sc_jstart = md->md_jstart; sc->sc_jend = md->md_jend; if (md->md_provider[0] != '\0') sc->sc_flags |= GJF_DEVICE_HARDCODED; sc->sc_journal_offset = md->md_joffset; sc->sc_journal_id = md->md_jid; sc->sc_journal_previous_id = md->md_jid; } if (md->md_type & GJ_TYPE_JOURNAL) { if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } } else { /* * Journal is on the same provider as data, which means * that data provider ends where journal starts. */ sc->sc_mediasize = md->md_jstart; } sc->sc_jconsumer = cp; } /* Start switcher kproc if needed. */ if (g_journal_switcher_proc == NULL) g_journal_start_switcher(mp); if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) { /* Journal is not complete yet. */ return (gp); } else { /* Journal complete, cancel timeout. */ callout_drain(&sc->sc_callout); } error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0, "g_journal %s", sc->sc_name); if (error != 0) { GJ_DEBUG(0, "Cannot create worker thread for %s.journal.", sc->sc_name); g_journal_destroy(sc); return (NULL); } return (gp); } static void g_journal_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; g_detach(cp); g_destroy_consumer(cp); } static int g_journal_destroy(struct g_journal_softc *sc) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_error_provider(pp, ENXIO); g_journal_flush(sc); g_journal_flush_send(sc); g_journal_switch(sc); } sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN); g_topology_unlock(); if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); wakeup(sc); while (sc->sc_worker != NULL) msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0); mtx_unlock(&sc->sc_mtx); if (pp != NULL) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); g_topology_lock(); g_wither_provider(pp, ENXIO); } else { g_topology_lock(); } mtx_destroy(&sc->sc_mtx); if (sc->sc_current_count != 0) { GJ_DEBUG(0, "Warning! 
Number of current requests %d.", sc->sc_current_count); } gp->softc = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->acr + cp->acw + cp->ace > 0) g_access(cp, -1, -1, -1); /* * We keep all consumers open for writting, so if I'll detach * and destroy consumer here, I'll get providers for taste, so * journal will be started again. * Sending an event here, prevents this from happening. */ g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL); } g_wither_geom(gp, ENXIO); free(sc, M_JOURNAL); return (0); } static void g_journal_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_journal_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); GJ_DEBUG(2, "Tasting %s.", pp->name); if (pp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, "journal:taste"); /* This orphan function should be never called. */ gp->orphan = g_journal_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_journal_metadata_read(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_journal_debug >= 2) journal_metadata_dump(&md); gp = g_journal_create(mp, pp, &md); return (gp); } static struct g_journal_softc * g_journal_find_device(struct g_class *mp, const char *name) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; if (strncmp(name, "/dev/", 5) == 0) name += 5; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; pp = LIST_FIRST(&gp->provider); if (strcmp(sc->sc_name, name) == 0) return (sc); if (pp != NULL && strcmp(pp->name, name) == 0) return (sc); } return (NULL); } static void g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_journal_softc *sc; const char *name; char param[16]; int *nargs; int error, i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_journal_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_journal_destroy(sc); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", LIST_FIRST(&sc->sc_geom->provider)->name, error); return; } } } static void g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused) { g_topology_assert(); g_topology_unlock(); g_journal_sync_requested++; wakeup(&g_journal_switcher_state); while (g_journal_sync_requested > 0) tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2); g_topology_lock(); } static void g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == 
NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_JOURNAL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_journal_ctl_destroy(req, mp); return; } else if (strcmp(verb, "sync") == 0) { g_journal_ctl_sync(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_journal_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { int first = 1; sbuf_printf(sb, "%s", indent); if (cp == sc->sc_dconsumer) { sbuf_cat(sb, "Data"); first = 0; } if (cp == sc->sc_jconsumer) { if (!first) sbuf_cat(sb, ","); sbuf_cat(sb, "Journal"); } sbuf_cat(sb, "\n"); if (cp == sc->sc_jconsumer) { sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jstart); sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jend); } } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); } } static eventhandler_tag g_journal_event_shutdown = NULL; static eventhandler_tag g_journal_event_lowmem = NULL; static void g_journal_shutdown(void *arg, int howto __unused) { struct g_class *mp; struct g_geom *gp, *gp2; if (panicstr != NULL) return; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; GJ_DEBUG(0, "Shutting down geom %s.", gp->name); g_journal_destroy(gp->softc); } g_topology_unlock(); } /* * Free cached requests from inactive queue in case of low memory. * We free GJ_FREE_AT_ONCE elements at once. */ #define GJ_FREE_AT_ONCE 4 static void g_journal_lowmem(void *arg, int howto __unused) { struct g_journal_softc *sc; struct g_class *mp; struct g_geom *gp; struct bio *bp; u_int nfree = GJ_FREE_AT_ONCE; g_journal_stats_low_mem++; mp = arg; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) continue; mtx_lock(&sc->sc_mtx); for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL; nfree--, bp = bp->bio_next) { /* * This is safe to free the bio_data, because: * 1. If bio_data is NULL it will be read from the * inactive journal. * 2. If bp is sent down, it is first removed from the * inactive queue, so it's impossible to free the * data from under in-flight bio. * On the other hand, freeing elements from the active * queue, is not safe. */ if (bp->bio_data != NULL) { GJ_DEBUG(2, "Freeing data from %s.", sc->sc_name); gj_free(bp->bio_data, bp->bio_length); bp->bio_data = NULL; } } mtx_unlock(&sc->sc_mtx); if (nfree == 0) break; } g_topology_unlock(); } static void g_journal_switcher(void *arg); static void g_journal_init(struct g_class *mp) { /* Pick a conservative value if provided value sucks. */ if (g_journal_cache_divisor <= 0 || (vm_kmem_size / g_journal_cache_divisor == 0)) { g_journal_cache_divisor = 5; } if (g_journal_cache_limit > 0) { g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor; g_journal_cache_low = (g_journal_cache_limit / 100) * g_journal_cache_switch; } g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync, g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_shutdown == NULL) GJ_DEBUG(0, "Warning! Cannot register shutdown event."); g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_lowmem == NULL) GJ_DEBUG(0, "Warning! 
Cannot register lowmem event."); } static void g_journal_fini(struct g_class *mp) { if (g_journal_event_shutdown != NULL) { EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_journal_event_shutdown); } if (g_journal_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem); g_journal_stop_switcher(); } DECLARE_GEOM_CLASS(g_journal_class, g_journal); static const struct g_journal_desc * g_journal_find_desc(const char *fstype) { const struct g_journal_desc *desc; int i; for (desc = g_journal_filesystems[i = 0]; desc != NULL; desc = g_journal_filesystems[++i]) { if (strcmp(desc->jd_fstype, fstype) == 0) break; } return (desc); } static void g_journal_switch_wait(struct g_journal_softc *sc) { struct bintime bt; mtx_assert(&sc->sc_mtx, MA_OWNED); if (g_journal_debug >= 2) { if (sc->sc_flush_in_progress > 0) { GJ_DEBUG(2, "%d requests flushing.", sc->sc_flush_in_progress); } if (sc->sc_copy_in_progress > 0) { GJ_DEBUG(2, "%d requests copying.", sc->sc_copy_in_progress); } if (sc->sc_flush_count > 0) { GJ_DEBUG(2, "%d requests to flush.", sc->sc_flush_count); } if (sc->sc_delayed_count > 0) { GJ_DEBUG(2, "%d requests delayed.", sc->sc_delayed_count); } } g_journal_stats_switches++; if (sc->sc_copy_in_progress > 0) g_journal_stats_wait_for_copy++; GJ_TIMER_START(1, &bt); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; sc->sc_flags |= GJF_DEVICE_SWITCH; wakeup(sc); while (sc->sc_flags & GJF_DEVICE_SWITCH) { msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO, "gj:switch", 0); } GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name); } static void g_journal_do_switch(struct g_class *classp) { struct g_journal_softc *sc; const struct g_journal_desc *desc; struct g_geom *gp; struct mount *mp; struct bintime bt; char *mountpoint; int error, save; g_topology_lock(); LIST_FOREACH(gp, &classp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; mtx_lock(&sc->sc_mtx); sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); } g_topology_unlock(); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_gjprovider == NULL) continue; if (mp->mnt_flag & MNT_RDONLY) continue; desc = g_journal_find_desc(mp->mnt_stat.f_fstypename); if (desc == NULL) continue; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */ g_topology_lock(); sc = g_journal_find_device(classp, mp->mnt_gjprovider); g_topology_unlock(); if (sc == NULL) { GJ_DEBUG(0, "Cannot find journal geom for %s.", mp->mnt_gjprovider); goto next; } else if (JEMPTY(sc)) { mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); goto next; } mountpoint = mp->mnt_stat.f_mntonname; error = vn_start_write(NULL, &mp, V_WAIT); if (error != 0) { GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).", mountpoint, error); goto next; } save = curthread_pflags_set(TDP_SYNCIO); GJ_TIMER_START(1, &bt); vfs_msync(mp, MNT_NOWAIT); GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint); GJ_TIMER_START(1, &bt); error = VFS_SYNC(mp, MNT_NOWAIT); if (error == 0) GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint); else { GJ_DEBUG(0, "Cannot sync file system %s (error=%d).", mountpoint, error); } curthread_pflags_restore(save); vn_finished_write(mp); if (error != 0) goto next; /* * Send BIO_FLUSH before freezing the file system, so it can be * faster after the freeze. 
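g_journal_init() above sizes the journal cache as vm_kmem_size divided by the cache divisor, with the low-water mark taken as a percentage of that limit. A small sketch of the same arithmetic; the concrete numbers are invented and only illustrate the shape of the computation.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Illustrative stand-ins for the kernel values. */
	uint64_t vm_kmem_size = 4ULL * 1024 * 1024 * 1024;	/* 4 GB of kernel VM */
	unsigned divisor = 5;		/* cache divisor, as defaulted above */
	unsigned switch_pct = 90;	/* percentage that triggers a switch (invented) */

	uint64_t limit = vm_kmem_size / divisor;
	uint64_t low = (limit / 100) * switch_pct;

	printf("cache limit %ju, low-water mark %ju\n",
	    (uintmax_t)limit, (uintmax_t)low);
	return (0);
}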
*/ GJ_TIMER_START(1, &bt); g_journal_flush_cache(sc); GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", mountpoint, error); goto next; } error = desc->jd_clean(mp); if (error != 0) goto next; mtx_lock(&sc->sc_mtx); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); vfs_write_resume(mp, 0); next: mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); sc = NULL; for (;;) { g_topology_lock(); LIST_FOREACH(gp, &g_journal_class.geom, geom) { sc = gp->softc; if (sc == NULL) continue; mtx_lock(&sc->sc_mtx); if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE && !(sc->sc_flags & GJF_DEVICE_DESTROY) && (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) { break; } mtx_unlock(&sc->sc_mtx); sc = NULL; } g_topology_unlock(); if (sc == NULL) break; mtx_assert(&sc->sc_mtx, MA_OWNED); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); } } static void g_journal_start_switcher(struct g_class *mp) { int error; g_topology_assert(); MPASS(g_journal_switcher_proc == NULL); g_journal_switcher_state = GJ_SWITCHER_WORKING; error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc, 0, 0, "g_journal switcher"); KASSERT(error == 0, ("Cannot create switcher thread.")); } static void g_journal_stop_switcher(void) { g_topology_assert(); MPASS(g_journal_switcher_proc != NULL); g_journal_switcher_state = GJ_SWITCHER_DIE; wakeup(&g_journal_switcher_state); while (g_journal_switcher_state != GJ_SWITCHER_DIED) tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5); GJ_DEBUG(1, "Switcher died."); g_journal_switcher_proc = NULL; } /* * TODO: Kill switcher thread on last geom destruction? */ static void g_journal_switcher(void *arg) { struct g_class *mp; struct bintime bt; int error; mp = arg; curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { g_journal_switcher_wokenup = 0; error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait", g_journal_switch_time * hz); if (g_journal_switcher_state == GJ_SWITCHER_DIE) { g_journal_switcher_state = GJ_SWITCHER_DIED; GJ_DEBUG(1, "Switcher exiting."); wakeup(&g_journal_switcher_state); kproc_exit(0); } if (error == 0 && g_journal_sync_requested == 0) { GJ_DEBUG(1, "Out of cache, force switch (used=%jd " "limit=%jd).", (intmax_t)g_journal_cache_used, (intmax_t)g_journal_cache_limit); } GJ_TIMER_START(1, &bt); g_journal_do_switch(mp); GJ_TIMER_STOP(1, &bt, "Entire switch time"); if (g_journal_sync_requested > 0) { g_journal_sync_requested = 0; wakeup(&g_journal_sync_requested); } } } Index: head/sys/geom/journal/g_journal.h =================================================================== --- head/sys/geom/journal/g_journal.h (revision 350693) +++ head/sys/geom/journal/g_journal.h (revision 350694) @@ -1,394 +1,376 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
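The switcher kproc above sleeps with a timeout, is woken early for forced syncs, and is torn down through a WORKING -> DIE -> DIED handshake so g_journal_stop_switcher() can wait for the thread to exit. The following is only a userland analogue of that handshake, with pthreads and a condition variable standing in for tsleep()/wakeup(); none of the identifiers below are kernel APIs.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

enum { SW_WORKING, SW_DIE, SW_DIED };

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int state = SW_WORKING;

static void *
switcher(void *arg)
{
	struct timespec ts;

	(void)arg;
	pthread_mutex_lock(&mtx);
	for (;;) {
		/* Sleep with a timeout, like tsleep(..., switch_time * hz). */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;
		pthread_cond_timedwait(&cv, &mtx, &ts);
		if (state == SW_DIE) {
			state = SW_DIED;
			pthread_cond_broadcast(&cv);	/* like wakeup() */
			break;
		}
		printf("periodic or requested switch\n");
	}
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, switcher, NULL);
	pthread_mutex_lock(&mtx);
	state = SW_DIE;			/* like g_journal_stop_switcher() */
	pthread_cond_broadcast(&cv);
	while (state != SW_DIED)
		pthread_cond_wait(&cv, &mtx);
	pthread_mutex_unlock(&mtx);
	pthread_join(td, NULL);
	return (0);
}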
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_JOURNAL_H_ #define _G_JOURNAL_H_ #include #include #ifdef _KERNEL #include #endif #define G_JOURNAL_CLASS_NAME "JOURNAL" #define G_JOURNAL_MAGIC "GEOM::JOURNAL" /* * Version history: * 0 - Initial version number. */ #define G_JOURNAL_VERSION 0 #ifdef _KERNEL extern int g_journal_debug; -#define GJ_DEBUG(lvl, ...) do { \ - if (g_journal_debug >= (lvl)) { \ - printf("GEOM_JOURNAL"); \ - if (g_journal_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define GJ_LOGREQ(lvl, bp, ...) do { \ - if (g_journal_debug >= (lvl)) { \ - printf("GEOM_JOURNAL"); \ - if (g_journal_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define GJ_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_JOURNAL", g_journal_debug, (lvl), NULL, __VA_ARGS__) +#define GJ_LOGREQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_JOURNAL", g_journal_debug, (lvl), (bp), __VA_ARGS__) #define JEMPTY(sc) ((sc)->sc_journal_offset - \ (sc)->sc_jprovider->sectorsize == \ (sc)->sc_active.jj_offset && \ (sc)->sc_current_count == 0) #define GJ_BIO_REGULAR 0x00 #define GJ_BIO_READ 0x01 #define GJ_BIO_JOURNAL 0x02 #define GJ_BIO_COPY 0x03 #define GJ_BIO_MASK 0x0f #if 0 #define GJF_BIO_DONT_FREE 0x10 #define GJF_BIO_MASK 0xf0 #endif #define GJF_DEVICE_HARDCODED 0x0001 #define GJF_DEVICE_DESTROY 0x0010 #define GJF_DEVICE_SWITCH 0x0020 #define GJF_DEVICE_BEFORE_SWITCH 0x0040 #define GJF_DEVICE_CLEAN 0x0080 #define GJF_DEVICE_CHECKSUM 0x0100 #define GJ_HARD_LIMIT 64 /* * We keep pointers to journaled data in bio structure and because we * need to store two off_t values (offset in data provider and offset in * journal), we have to borrow bio_completed field for this. */ #define bio_joffset bio_completed /* * Use bio_caller1 field as a pointer in queue. */ #define bio_next bio_caller1 /* * There are two such structures maintained inside each journaled device. * One describes active part of the journal, were recent requests are stored. * The second describes the last consistent part of the journal with requests * that are copied to the destination provider. */ struct g_journal_journal { struct bio *jj_queue; /* Cached journal entries. */ off_t jj_offset; /* Journal's start offset. 
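The only functional change in this header is that the open-coded GJ_DEBUG/GJ_LOGREQ bodies are folded into a shared _GEOM_DEBUG helper (presumably provided by a common GEOM debug header added elsewhere in this revision; the include list above does not show its name). What such a leveled, prefix-printing macro boils down to can be sketched standalone with plain printf; the DBG name and the hard-coded level below are illustrative only.

#include <stdio.h>

static int g_journal_debug = 2;		/* stand-in for the sysctl-tunable level */

/*
 * Same shape as the removed GJ_DEBUG body: print the class prefix, the
 * message level when debugging is enabled, the caller's message, a newline.
 */
#define DBG(prefix, lvl_var, lvl, ...) do {				\
	if ((lvl_var) >= (lvl)) {					\
		printf("%s", prefix);					\
		if ((lvl_var) > 0)					\
			printf("[%u]", (unsigned)(lvl));		\
		printf(": ");						\
		printf(__VA_ARGS__);					\
		printf("\n");						\
	}								\
} while (0)

#define GJ_DEBUG(lvl, ...)	DBG("GEOM_JOURNAL", g_journal_debug, (lvl), __VA_ARGS__)

int
main(void)
{
	GJ_DEBUG(1, "Tasting %s.", "ada0p2");
	GJ_DEBUG(3, "suppressed: level %d is above the threshold", 3);
	return (0);
}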
*/ }; struct g_journal_softc { uint32_t sc_id; uint8_t sc_type; uint8_t sc_orig_type; struct g_geom *sc_geom; u_int sc_flags; struct mtx sc_mtx; off_t sc_mediasize; u_int sc_sectorsize; #define GJ_FLUSH_DATA 0x01 #define GJ_FLUSH_JOURNAL 0x02 u_int sc_bio_flush; uint32_t sc_journal_id; uint32_t sc_journal_next_id; int sc_journal_copying; off_t sc_journal_offset; off_t sc_journal_previous_id; struct bio_queue_head sc_back_queue; struct bio_queue_head sc_regular_queue; struct bio_queue_head sc_delayed_queue; int sc_delayed_count; struct bio *sc_current_queue; int sc_current_count; struct bio *sc_flush_queue; int sc_flush_count; int sc_flush_in_progress; struct bio *sc_copy_queue; int sc_copy_in_progress; struct g_consumer *sc_dconsumer; struct g_consumer *sc_jconsumer; struct g_journal_journal sc_inactive; struct g_journal_journal sc_active; off_t sc_jstart; /* Journal space start offset. */ off_t sc_jend; /* Journal space end offset. */ struct callout sc_callout; struct proc *sc_worker; struct root_hold_token *sc_rootmount; }; #define sc_dprovider sc_dconsumer->provider #define sc_jprovider sc_jconsumer->provider #define sc_name sc_dprovider->name #define GJQ_INSERT_HEAD(head, bp) do { \ (bp)->bio_next = (head); \ (head) = (bp); \ } while (0) #define GJQ_INSERT_AFTER(head, bp, pbp) do { \ if ((pbp) == NULL) \ GJQ_INSERT_HEAD(head, bp); \ else { \ (bp)->bio_next = (pbp)->bio_next; \ (pbp)->bio_next = (bp); \ } \ } while (0) #define GJQ_LAST(head, bp) do { \ struct bio *_bp; \ \ if ((head) == NULL) { \ (bp) = (head); \ break; \ } \ for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) \ continue; \ (bp) = (_bp); \ } while (0) #define GJQ_FIRST(head) (head) #define GJQ_REMOVE(head, bp) do { \ struct bio *_bp; \ \ if ((head) == (bp)) { \ (head) = (bp)->bio_next; \ (bp)->bio_next = NULL; \ break; \ } \ for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) {\ if (_bp->bio_next == (bp)) \ break; \ } \ KASSERT(_bp->bio_next != NULL, ("NULL bio_next")); \ KASSERT(_bp->bio_next == (bp), ("bio_next != bp")); \ _bp->bio_next = (bp)->bio_next; \ (bp)->bio_next = NULL; \ } while (0) #define GJQ_FOREACH(head, bp) \ for ((bp) = (head); (bp) != NULL; (bp) = (bp)->bio_next) #define GJ_HEADER_MAGIC "GJHDR" struct g_journal_header { char jh_magic[sizeof(GJ_HEADER_MAGIC)]; uint32_t jh_journal_id; uint32_t jh_journal_next_id; } __packed; struct g_journal_entry { uint64_t je_joffset; uint64_t je_offset; uint64_t je_length; } __packed; #define GJ_RECORD_HEADER_MAGIC "GJRHDR" #define GJ_RECORD_HEADER_NENTRIES (20) #define GJ_RECORD_MAX_SIZE(sc) \ ((sc)->sc_jprovider->sectorsize + GJ_RECORD_HEADER_NENTRIES * MAXPHYS) #define GJ_VALIDATE_OFFSET(offset, sc) do { \ if ((offset) + GJ_RECORD_MAX_SIZE(sc) >= (sc)->sc_jend) { \ (offset) = (sc)->sc_jstart; \ GJ_DEBUG(2, "Starting from the beginning (%s).", \ (sc)->sc_name); \ } \ } while (0) struct g_journal_record_header { char jrh_magic[sizeof(GJ_RECORD_HEADER_MAGIC)]; uint32_t jrh_journal_id; uint16_t jrh_nentries; u_char jrh_sum[8]; struct g_journal_entry jrh_entries[GJ_RECORD_HEADER_NENTRIES]; } __packed; typedef int (g_journal_clean_t)(struct mount *mp); typedef void (g_journal_dirty_t)(struct g_consumer *cp); struct g_journal_desc { const char *jd_fstype; g_journal_clean_t *jd_clean; g_journal_dirty_t *jd_dirty; }; /* Supported file systems. */ extern const struct g_journal_desc g_journal_ufs; #define GJ_TIMER_START(lvl, bt) do { \ if (g_journal_debug >= (lvl)) \ binuptime(bt); \ } while (0) #define GJ_TIMER_STOP(lvl, bt, ...) 
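The softc above threads several ad-hoc singly linked bio lists through bio_caller1 (aliased to bio_next) and manipulates them with the GJQ_* macros. The same insert/remove/walk pattern, modelled on a plain struct with no GEOM types so it can be compiled and run on its own:

#include <stdio.h>

struct node {
	int id;
	struct node *next;	/* plays the role of bio_next */
};

/* Mirrors GJQ_INSERT_HEAD. */
static void
insert_head(struct node **head, struct node *n)
{
	n->next = *head;
	*head = n;
}

/* Mirrors GJQ_REMOVE: walk to the predecessor of n, then unlink n. */
static void
remove_node(struct node **head, struct node *n)
{
	struct node *p;

	if (*head == NULL)
		return;
	if (*head == n) {
		*head = n->next;
		n->next = NULL;
		return;
	}
	for (p = *head; p->next != NULL && p->next != n; p = p->next)
		;
	if (p->next == n) {
		p->next = n->next;
		n->next = NULL;
	}
}

int
main(void)
{
	struct node a = { 1, NULL }, b = { 2, NULL }, *head = NULL, *p;

	insert_head(&head, &a);
	insert_head(&head, &b);			/* list is now b -> a */
	remove_node(&head, &a);
	for (p = head; p != NULL; p = p->next)	/* like GJQ_FOREACH */
		printf("%d\n", p->id);
	return (0);
}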
do { \ if (g_journal_debug >= (lvl)) { \ struct bintime _bt2; \ struct timeval _tv; \ \ binuptime(&_bt2); \ bintime_sub(&_bt2, bt); \ bintime2timeval(&_bt2, &_tv); \ printf("GEOM_JOURNAL"); \ if (g_journal_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(": %jd.%06jds\n", (intmax_t)_tv.tv_sec, \ (intmax_t)_tv.tv_usec); \ } \ } while (0) #endif /* _KERNEL */ #define GJ_TYPE_DATA 0x01 #define GJ_TYPE_JOURNAL 0x02 #define GJ_TYPE_COMPLETE (GJ_TYPE_DATA|GJ_TYPE_JOURNAL) #define GJ_FLAG_CLEAN 0x01 #define GJ_FLAG_CHECKSUM 0x02 struct g_journal_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ uint32_t md_id; /* Journal unique ID. */ uint8_t md_type; /* Provider type. */ uint64_t md_jstart; /* Journal space start offset. */ uint64_t md_jend; /* Journal space end offset. */ uint64_t md_joffset; /* Last known consistent journal offset. */ uint32_t md_jid; /* Last known consistent journal ID. */ uint64_t md_flags; /* Journal flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void journal_metadata_encode(struct g_journal_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); le32enc(data + 20, md->md_id); *(data + 24) = md->md_type; le64enc(data + 25, md->md_jstart); le64enc(data + 33, md->md_jend); le64enc(data + 41, md->md_joffset); le32enc(data + 49, md->md_jid); le64enc(data + 53, md->md_flags); bcopy(md->md_provider, data + 61, 16); le64enc(data + 77, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 85); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 85, 16); } static __inline int journal_metadata_decode_v0(const u_char *data, struct g_journal_metadata *md) { MD5_CTX ctx; md->md_id = le32dec(data + 20); md->md_type = *(data + 24); md->md_jstart = le64dec(data + 25); md->md_jend = le64dec(data + 33); md->md_joffset = le64dec(data + 41); md->md_jid = le32dec(data + 49); md->md_flags = le64dec(data + 53); bcopy(data + 61, md->md_provider, 16); md->md_provsize = le64dec(data + 77); MD5Init(&ctx); MD5Update(&ctx, data, 85); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 85, 16) != 0) return (EINVAL); return (0); } static __inline int journal_metadata_decode(const u_char *data, struct g_journal_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: error = journal_metadata_decode_v0(data, md); break; default: error = EINVAL; break; } return (error); } static __inline void journal_metadata_dump(const struct g_journal_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" id: %u\n", (u_int)md->md_id); printf(" type: %u\n", (u_int)md->md_type); printf(" start: %ju\n", (uintmax_t)md->md_jstart); printf(" end: %ju\n", (uintmax_t)md->md_jend); printf(" joffset: %ju\n", (uintmax_t)md->md_joffset); printf(" jid: %u\n", (u_int)md->md_jid); printf(" flags: %u\n", (u_int)md->md_flags); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_JOURNAL_H_ */ Index: head/sys/geom/journal/g_journal_ufs.c 
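journal_metadata_encode()/decode() above serialize the metadata into a fixed little-endian layout: the 16-byte magic at offset 0, version at 16, id at 20, type at 24, the three 64-bit journal offsets from 25, the journal id at 49, flags at 53, the hardcoded provider name at 61, provsize at 77, and an MD5 digest over the first 85 bytes stored at 85. A standalone sketch of the same fixed-offset packing; the put_le helpers stand in for le32enc()/le64enc(), the field values are invented, and the MD5 step is omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal stand-ins for le32enc()/le64enc() from <sys/endian.h>. */
static void
put_le32(uint8_t *p, uint32_t v)
{
	p[0] = (uint8_t)v;
	p[1] = (uint8_t)(v >> 8);
	p[2] = (uint8_t)(v >> 16);
	p[3] = (uint8_t)(v >> 24);
}

static void
put_le64(uint8_t *p, uint64_t v)
{
	put_le32(p, (uint32_t)v);
	put_le32(p + 4, (uint32_t)(v >> 32));
}

int
main(void)
{
	uint8_t buf[101];		/* 85 payload bytes + 16-byte hash slot */

	memset(buf, 0, sizeof(buf));
	memcpy(buf, "GEOM::JOURNAL", 14);	/* md_magic, 16-byte field */
	put_le32(buf + 16, 0);			/* md_version */
	put_le32(buf + 20, 12345);		/* md_id */
	buf[24] = 0x03;				/* md_type: data + journal */
	put_le64(buf + 25, 1048576);		/* md_jstart */
	put_le64(buf + 33, 1073741824);		/* md_jend */
	put_le64(buf + 41, 1048576);		/* md_joffset */
	put_le32(buf + 49, 1);			/* md_jid */
	put_le64(buf + 53, 0);			/* md_flags */
	/* buf + 61: md_provider[16]; buf + 77: md_provsize; buf + 85: MD5. */
	printf("version byte at offset 16: 0x%02x\n", buf[16]);
	return (0);
}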
=================================================================== --- head/sys/geom/journal/g_journal_ufs.c (revision 350693) +++ head/sys/geom/journal/g_journal_ufs.c (revision 350694) @@ -1,104 +1,105 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include static int g_journal_ufs_clean(struct mount *mp) { struct ufsmount *ump; struct fs *fs; int flags; ump = VFSTOUFS(mp); fs = ump->um_fs; flags = fs->fs_flags; fs->fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK); ffs_sbupdate(ump, MNT_WAIT, 1); fs->fs_flags = flags; return (0); } static void g_journal_ufs_dirty(struct g_consumer *cp) { struct fs *fs; int error; fs = NULL; if (SBLOCKSIZE % cp->provider->sectorsize != 0 || ffs_sbget(cp, &fs, STDSB, M_GEOM, g_use_g_read_data) != 0) { GJ_DEBUG(0, "Cannot find superblock to mark file system %s " "as dirty.", cp->provider->name); KASSERT(fs == NULL, ("g_journal_ufs_dirty: non-NULL fs %p\n", fs)); return; } GJ_DEBUG(0, "clean=%d flags=0x%x", fs->fs_clean, fs->fs_flags); fs->fs_clean = 0; fs->fs_flags |= FS_NEEDSFSCK | FS_UNCLEAN; error = ffs_sbput(cp, fs, fs->fs_sblockloc, g_use_g_write_data); g_free(fs->fs_csp); g_free(fs); if (error != 0) { GJ_DEBUG(0, "Cannot mark file system %s as dirty " "(error=%d).", cp->provider->name, error); } else { GJ_DEBUG(0, "File system %s marked as dirty.", cp->provider->name); } } const struct g_journal_desc g_journal_ufs = { .jd_fstype = "ufs", .jd_clean = g_journal_ufs_clean, .jd_dirty = g_journal_ufs_dirty }; MODULE_DEPEND(g_journal, ufs, 1, 1, 1); MODULE_VERSION(geom_journal, 0); Index: head/sys/geom/label/g_label.c =================================================================== --- head/sys/geom/label/g_label.c (revision 350693) +++ head/sys/geom/label/g_label.c (revision 350694) @@ -1,560 +1,561 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include FEATURE(geom_label, "GEOM labeling support"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, label, CTLFLAG_RW, 0, "GEOM_LABEL stuff"); u_int g_label_debug = 0; SYSCTL_UINT(_kern_geom_label, OID_AUTO, debug, CTLFLAG_RWTUN, &g_label_debug, 0, "Debug level"); static int g_label_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static int g_label_destroy(struct g_geom *gp, boolean_t force); static struct g_geom *g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb); struct g_class g_label_class = { .name = G_LABEL_CLASS_NAME, .version = G_VERSION, .ctlreq = g_label_config, .taste = g_label_taste, .destroy_geom = g_label_destroy_geom }; /* * To add a new file system where you want to look for volume labels, * you have to: * 1. Add a file g_label_.c which implements labels recognition. * 2. Add an 'extern const struct g_label_desc g_label_;' into * g_label.h file. * 3. Add an element to the table below '&g_label_,'. * 4. Add your file to sys/conf/files. * 5. Add your file to sys/modules/geom/geom_label/Makefile. * 6. Add your file system to manual page sbin/geom/class/label/glabel.8. */ const struct g_label_desc *g_labels[] = { &g_label_gpt, &g_label_gpt_uuid, #ifdef GEOM_LABEL &g_label_ufs_id, &g_label_ufs_volume, &g_label_iso9660, &g_label_msdosfs, &g_label_ext2fs, &g_label_reiserfs, &g_label_ntfs, &g_label_disk_ident, &g_label_flashmap, #endif NULL }; void g_label_rtrim(char *label, size_t size) { ptrdiff_t i; for (i = size - 1; i >= 0; i--) { if (label[i] == '\0') continue; else if (label[i] == ' ') label[i] = '\0'; else break; } } static int g_label_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp __unused) { /* * XXX: Unloading a class which is using geom_slice:1.56 is currently * XXX: broken, so we deny unloading when we have geoms. 
*/ return (EOPNOTSUPP); } static void g_label_orphan(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_orphan(cp); } static void g_label_spoiled(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_spoiled(cp); } static void g_label_resize(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s resized.", LIST_FIRST(&cp->geom->provider)->name); g_slice_config(cp->geom, 0, G_SLICE_CONFIG_FORCE, (off_t)0, cp->provider->mediasize, cp->provider->sectorsize, "notused"); } static int g_label_is_name_ok(const char *label) { const char *s; /* Check if the label starts from ../ */ if (strncmp(label, "../", 3) == 0) return (0); /* Check if the label contains /../ */ if (strstr(label, "/../") != NULL) return (0); /* Check if the label ends at ../ */ if ((s = strstr(label, "/..")) != NULL && s[3] == '\0') return (0); return (1); } static void g_label_mangle_name(char *label, size_t size) { struct sbuf *sb; const u_char *c; sb = sbuf_new(NULL, NULL, size, SBUF_FIXEDLEN); for (c = label; *c != '\0'; c++) { if (!isprint(*c) || isspace(*c) || *c =='"' || *c == '%') sbuf_printf(sb, "%%%02X", *c); else sbuf_putc(sb, *c); } if (sbuf_finish(sb) != 0) label[0] = '\0'; else strlcpy(label, sbuf_data(sb), size); sbuf_delete(sb); } static struct g_geom * g_label_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, const char *label, const char *dir, off_t mediasize) { struct g_geom *gp; struct g_provider *pp2; struct g_consumer *cp; char name[64]; g_topology_assert(); if (!g_label_is_name_ok(label)) { G_LABEL_DEBUG(0, "%s contains suspicious label, skipping.", pp->name); G_LABEL_DEBUG(1, "%s suspicious label is: %s", pp->name, label); if (req != NULL) gctl_error(req, "Label name %s is invalid.", label); return (NULL); } gp = NULL; cp = NULL; if (snprintf(name, sizeof(name), "%s/%s", dir, label) >= sizeof(name)) { if (req != NULL) gctl_error(req, "Label name %s is too long.", label); return (NULL); } LIST_FOREACH(gp, &mp->geom, geom) { pp2 = LIST_FIRST(&gp->provider); if (pp2 == NULL) continue; if ((pp2->flags & G_PF_ORPHAN) != 0) continue; if (strcmp(pp2->name, name) == 0) { G_LABEL_DEBUG(1, "Label %s(%s) already exists (%s).", label, name, pp->name); if (req != NULL) { gctl_error(req, "Provider %s already exists.", name); } return (NULL); } } gp = g_slice_new(mp, 1, pp, &cp, NULL, 0, NULL); if (gp == NULL) { G_LABEL_DEBUG(0, "Cannot create slice %s.", label); if (req != NULL) gctl_error(req, "Cannot create slice %s.", label); return (NULL); } gp->orphan = g_label_orphan; gp->spoiled = g_label_spoiled; gp->resize = g_label_resize; g_access(cp, -1, 0, 0); g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t)0, mediasize, pp->sectorsize, "%s", name); G_LABEL_DEBUG(1, "Label for provider %s is %s.", pp->name, name); return (gp); } static int g_label_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp; g_topology_assert(); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_LABEL_DEBUG(0, "Provider %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_LABEL_DEBUG(1, "Provider %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else if (pp != NULL) G_LABEL_DEBUG(1, "Label %s removed.", pp->name); g_slice_spoiled(LIST_FIRST(&gp->consumer)); return (0); } static int g_label_read_metadata(struct g_consumer *cp, struct g_label_metadata *md) { struct 
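g_label_create() above refuses label names that could escape the label directory (leading ../, embedded /../, trailing /..), and g_label_mangle_name() percent-encodes non-printable, blank and otherwise awkward characters before the label becomes part of a device name. A userland sketch of both checks; the kernel builds the mangled name with an sbuf, a fixed buffer is used here instead.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Reject path-traversal patterns, like g_label_is_name_ok(). */
static int
label_name_ok(const char *label)
{
	const char *s;

	if (strncmp(label, "../", 3) == 0)
		return (0);
	if (strstr(label, "/../") != NULL)
		return (0);
	if ((s = strstr(label, "/..")) != NULL && s[3] == '\0')
		return (0);
	return (1);
}

/* Percent-encode unsafe characters, like g_label_mangle_name(). */
static void
mangle_name(const char *in, char *out, size_t size)
{
	size_t o = 0;

	for (; *in != '\0' && o + 4 < size; in++) {
		unsigned char c = (unsigned char)*in;

		if (!isprint(c) || isspace(c) || c == '"' || c == '%')
			o += snprintf(out + o, size - o, "%%%02X", (unsigned)c);
		else
			out[o++] = (char)c;
	}
	out[o] = '\0';
}

int
main(void)
{
	char buf[64];

	printf("ok=%d\n", label_name_ok("../etc"));	/* 0: rejected */
	mangle_name("My Disk 100%", buf, sizeof(buf));
	printf("%s\n", buf);				/* My%20Disk%20100%25 */
	return (0);
}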
g_provider *pp; u_char *buf; int error; g_topology_assert(); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); if (buf == NULL) return (error); /* Decode metadata. */ label_metadata_decode(buf, md); g_free(buf); return (0); } static void g_label_orphan_taste(struct g_consumer *cp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static void g_label_start_taste(struct bio *bp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static int g_label_access_taste(struct g_provider *pp __unused, int dr __unused, int dw __unused, int de __unused) { KASSERT(1 == 0, ("%s called", __func__)); return (EOPNOTSUPP); } static struct g_geom * g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_label_metadata md; struct g_consumer *cp; struct g_geom *gp; int i; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_LABEL_DEBUG(2, "Tasting %s.", pp->name); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); if (strcmp(pp->geom->class->name, mp->name) == 0) return (NULL); gp = g_new_geomf(mp, "label:taste"); gp->start = g_label_start_taste; gp->access = g_label_access_taste; gp->orphan = g_label_orphan_taste; cp = g_new_consumer(gp); g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) goto end; do { if (g_label_read_metadata(cp, &md) != 0) break; if (strcmp(md.md_magic, G_LABEL_MAGIC) != 0) break; if (md.md_version > G_LABEL_VERSION) { printf("geom_label.ko module is too old to handle %s.\n", pp->name); break; } /* * Backward compatibility: */ /* * There was no md_provsize field in earlier versions of * metadata. */ if (md.md_version < 2) md.md_provsize = pp->mediasize; if (md.md_provsize != pp->mediasize) break; g_label_create(NULL, mp, pp, md.md_label, G_LABEL_DIR, pp->mediasize - pp->sectorsize); } while (0); for (i = 0; g_labels[i] != NULL; i++) { char label[128]; if (g_labels[i]->ld_enabled == 0) continue; g_topology_unlock(); g_labels[i]->ld_taste(cp, label, sizeof(label)); g_label_mangle_name(label, sizeof(label)); g_topology_lock(); if (label[0] == '\0') continue; g_label_create(NULL, mp, pp, label, g_labels[i]->ld_dir, pp->mediasize); } g_access(cp, -1, 0, 0); end: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static void g_label_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } /* * arg1 is the name of provider. */ name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", 1); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_LABEL_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } /* * arg0 is the label. 
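g_label_read_metadata() above reads the provider's last sector and decodes the label metadata stored there (the layout is spelled out by label_metadata_encode()/decode() in g_label.h below: magic at 0, version at 16, the 16-byte label at 20, provsize at 36). The equivalent against a file-backed image in plain C; the image path and the 512-byte sector size are assumptions, and get_le32 stands in for le32dec().

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static uint32_t
get_le32(const uint8_t *p)
{
	return ((uint32_t)p[0] | (uint32_t)p[1] << 8 |
	    (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24);
}

int
main(void)
{
	const off_t sectorsize = 512;		/* assumed sector size */
	uint8_t sector[512];
	char label[17];
	off_t mediasize;
	int fd;

	fd = open("labeled.img", O_RDONLY);	/* hypothetical image */
	if (fd == -1)
		return (1);
	mediasize = lseek(fd, 0, SEEK_END);
	/* The metadata lives in the last sector, as in g_label_read_metadata(). */
	if (pread(fd, sector, sizeof(sector), mediasize - sectorsize) !=
	    (ssize_t)sizeof(sector)) {
		close(fd);
		return (1);
	}
	if (memcmp(sector, "GEOM::LABEL", 12) == 0) {	/* magic plus its NUL */
		memcpy(label, sector + 20, 16);
		label[16] = '\0';
		printf("version %u, label \"%s\"\n", get_le32(sector + 16), label);
	}
	close(fd);
	return (0);
}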
*/ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", 0); return; } g_label_create(req, mp, pp, name, G_LABEL_DIR, pp->mediasize); } static const char * g_label_skip_dir(const char *name) { char path[64]; u_int i; if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); if (strncmp(name, G_LABEL_DIR "/", strlen(G_LABEL_DIR "/")) == 0) name += strlen(G_LABEL_DIR "/"); for (i = 0; g_labels[i] != NULL; i++) { snprintf(path, sizeof(path), "%s/", g_labels[i]->ld_dir); if (strncmp(name, path, strlen(path)) == 0) { name += strlen(path); break; } } return (name); } static struct g_geom * g_label_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_provider *pp; const char *pname; name = g_label_skip_dir(name); LIST_FOREACH(gp, &mp->geom, geom) { pp = LIST_FIRST(&gp->provider); pname = g_label_skip_dir(pp->name); if (strcmp(pname, name) == 0) return (gp); } return (NULL); } static void g_label_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } gp = g_label_find_geom(mp, name); if (gp == NULL) { G_LABEL_DEBUG(1, "Label %s is invalid.", name); gctl_error(req, "Label %s is invalid.", name); return; } error = g_label_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy label %s (error=%d).", LIST_FIRST(&gp->provider)->name, error); return; } } } static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_LABEL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_label_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_label_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } DECLARE_GEOM_CLASS(g_label_class, g_label); MODULE_VERSION(geom_label, 0); Index: head/sys/geom/label/g_label.h =================================================================== --- head/sys/geom/label/g_label.h (revision 350693) +++ head/sys/geom/label/g_label.h (revision 350694) @@ -1,120 +1,112 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_LABEL_H_ #define _G_LABEL_H_ #include #ifdef _KERNEL #include #endif #define G_LABEL_CLASS_NAME "LABEL" #define G_LABEL_MAGIC "GEOM::LABEL" /* * Version history: * 1 - Initial version number. * 2 - Added md_provsize field to metadata. */ #define G_LABEL_VERSION 2 #define G_LABEL_DIR "label" #ifdef _KERNEL extern u_int g_label_debug; -#define G_LABEL_DEBUG(lvl, ...) do { \ - if (g_label_debug >= (lvl)) { \ - printf("GEOM_LABEL"); \ - if (g_label_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) +#define G_LABEL_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_LABEL", g_label_debug, (lvl), NULL, __VA_ARGS__) SYSCTL_DECL(_kern_geom_label); #define G_LABEL_INIT(kind, label, descr) \ SYSCTL_NODE(_kern_geom_label, OID_AUTO, kind, CTLFLAG_RD, \ NULL, ""); \ SYSCTL_INT(_kern_geom_label_##kind, OID_AUTO, enable, \ CTLFLAG_RWTUN, &label.ld_enabled, 1, descr) typedef void g_label_taste_t (struct g_consumer *cp, char *label, size_t size); struct g_label_desc { g_label_taste_t *ld_taste; char *ld_dir; int ld_enabled; }; /* Supported labels. */ extern struct g_label_desc g_label_ufs_id; extern struct g_label_desc g_label_ufs_volume; extern struct g_label_desc g_label_iso9660; extern struct g_label_desc g_label_msdosfs; extern struct g_label_desc g_label_ext2fs; extern struct g_label_desc g_label_reiserfs; extern struct g_label_desc g_label_ntfs; extern struct g_label_desc g_label_gpt; extern struct g_label_desc g_label_gpt_uuid; extern struct g_label_desc g_label_disk_ident; extern struct g_label_desc g_label_flashmap; extern void g_label_rtrim(char *label, size_t size); #endif /* _KERNEL */ struct g_label_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_label[16]; /* Label. */ uint64_t md_provsize; /* Provider's size. 
*/ }; static __inline void label_metadata_encode(const struct g_label_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_label, data + 20, sizeof(md->md_label)); le64enc(data + 36, md->md_provsize); } static __inline void label_metadata_decode(const u_char *data, struct g_label_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_label, sizeof(md->md_label)); md->md_provsize = le64dec(data + 36); } #endif /* _G_LABEL_H_ */ Index: head/sys/geom/label/g_label_ext2fs.c =================================================================== --- head/sys/geom/label/g_label_ext2fs.c (revision 350693) +++ head/sys/geom/label/g_label_ext2fs.c (revision 350694) @@ -1,103 +1,104 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #define EXT2FS_SB_OFFSET 1024 #define EXT2_SUPER_MAGIC 0xef53 #define EXT2_DYNAMIC_REV 1 typedef struct e2sb { uint8_t fake1[56]; uint16_t s_magic; uint8_t fake2[18]; uint32_t s_rev_level; uint8_t fake3[40]; char s_volume_name[16]; } e2sb_t; static void g_label_ext2fs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; e2sb_t *fs; char *s_volume_name; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((EXT2FS_SB_OFFSET % pp->sectorsize) != 0) return; fs = (e2sb_t *)g_read_data(cp, EXT2FS_SB_OFFSET, pp->sectorsize, NULL); if (fs == NULL) return; /* Check for magic and versio n*/ if (fs->s_magic == EXT2_SUPER_MAGIC && fs->s_rev_level == EXT2_DYNAMIC_REV) { G_LABEL_DEBUG(1, "ext2fs file system detected on %s.", pp->name); } else { goto exit_free; } s_volume_name = fs->s_volume_name; /* Terminate label */ s_volume_name[sizeof(fs->s_volume_name) - 1] = '\0'; if (s_volume_name[0] == '/') s_volume_name += 1; /* Check for volume label */ if (s_volume_name[0] == '\0') goto exit_free; strlcpy(label, s_volume_name, size); exit_free: g_free(fs); } struct g_label_desc g_label_ext2fs = { .ld_taste = g_label_ext2fs_taste, .ld_dir = "ext2fs", .ld_enabled = 1 }; G_LABEL_INIT(ext2fs, g_label_ext2fs, "Create device nodes for EXT2FS volumes"); Index: head/sys/geom/label/g_label_iso9660.c =================================================================== --- head/sys/geom/label/g_label_iso9660.c (revision 350693) +++ head/sys/geom/label/g_label_iso9660.c (revision 350694) @@ -1,81 +1,82 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
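The ext2 taste routine above reads one sector's worth of data at byte offset 1024 and, per the e2sb_t overlay, finds s_magic 56 bytes in, s_rev_level at 76 and the 16-byte s_volume_name at 120. The same probe against a file-backed image; the path is hypothetical, a fixed 1024-byte read replaces the sector-size handling, and the revision-level check is skipped for brevity.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define EXT2FS_SB_OFFSET	1024
#define EXT2_SUPER_MAGIC	0xef53

int
main(void)
{
	uint8_t sb[1024];
	char label[17];
	uint16_t magic;
	int fd;

	fd = open("ext2.img", O_RDONLY);	/* hypothetical image */
	if (fd == -1)
		return (1);
	if (pread(fd, sb, sizeof(sb), EXT2FS_SB_OFFSET) != (ssize_t)sizeof(sb)) {
		close(fd);
		return (1);
	}
	magic = (uint16_t)(sb[56] | sb[57] << 8);	/* s_magic, little-endian */
	if (magic == EXT2_SUPER_MAGIC) {
		memcpy(label, sb + 120, 16);		/* s_volume_name */
		label[16] = '\0';
		printf("ext2 label: %s\n", label[0] != '\0' ? label : "(none)");
	}
	close(fd);
	return (0);
}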
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #define G_LABEL_ISO9660_DIR "iso9660" #define ISO9660_MAGIC "\x01" "CD001" "\x01\x00" #define ISO9660_OFFSET 0x8000 #define VOLUME_LEN 32 static void g_label_iso9660_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; char *sector, *volume; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((ISO9660_OFFSET % pp->sectorsize) != 0) return; sector = (char *)g_read_data(cp, ISO9660_OFFSET, pp->sectorsize, NULL); if (sector == NULL) return; if (bcmp(sector, ISO9660_MAGIC, sizeof(ISO9660_MAGIC) - 1) != 0) { g_free(sector); return; } G_LABEL_DEBUG(1, "ISO9660 file system detected on %s.", pp->name); volume = sector + 0x28; bzero(label, size); strlcpy(label, volume, MIN(size, VOLUME_LEN)); g_free(sector); g_label_rtrim(label, size); } struct g_label_desc g_label_iso9660 = { .ld_taste = g_label_iso9660_taste, .ld_dir = G_LABEL_ISO9660_DIR, .ld_enabled = 1 }; G_LABEL_INIT(iso9660, g_label_iso9660, "Create device nodes for ISO9660 volume names"); Index: head/sys/geom/label/g_label_msdosfs.c =================================================================== --- head/sys/geom/label/g_label_msdosfs.c (revision 350693) +++ head/sys/geom/label/g_label_msdosfs.c (revision 350694) @@ -1,219 +1,220 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Pawel Jakub Dawidek * Copyright (c) 2006 Tobias Reifenberger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include #define G_LABEL_MSDOSFS_DIR "msdosfs" #define LABEL_NO_NAME "NO NAME " static void g_label_msdosfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; FAT_BSBPB *pfat_bsbpb; FAT32_BSBPB *pfat32_bsbpb; FAT_DES *pfat_entry; uint8_t *sector0, *sector; g_topology_assert_not(); pp = cp->provider; sector0 = NULL; sector = NULL; bzero(label, size); /* Check if the sector size of the medium is a valid FAT sector size. 
*/ switch(pp->sectorsize) { case 512: case 1024: case 2048: case 4096: break; default: G_LABEL_DEBUG(1, "MSDOSFS: %s: sector size %d not compatible.", pp->name, pp->sectorsize); return; } /* Load 1st sector with boot sector and boot parameter block. */ sector0 = (uint8_t *)g_read_data(cp, 0, pp->sectorsize, NULL); if (sector0 == NULL) return; /* Check for the FAT boot sector signature. */ if (sector0[510] != 0x55 || sector0[511] != 0xaa) { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT signature found.", pp->name); goto error; } /* * Test if this is really a FAT volume and determine the FAT type. */ pfat_bsbpb = (FAT_BSBPB *)sector0; pfat32_bsbpb = (FAT32_BSBPB *)sector0; if (UINT16BYTES(pfat_bsbpb->BPB_FATSz16) != 0) { /* * If the BPB_FATSz16 field is not zero and the string "FAT" is * at the right place, this should be a FAT12 or FAT16 volume. */ if (strncmp(pfat_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/FAT16 volume detected.", pp->name); /* A volume with no name should have "NO NAME " as label. */ if (strncmp(pfat_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat_bsbpb->BS_VolLab)) == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume has no name.", pp->name); goto error; } strlcpy(label, pfat_bsbpb->BS_VolLab, MIN(size, sizeof(pfat_bsbpb->BS_VolLab) + 1)); } else if (UINT32BYTES(pfat32_bsbpb->BPB_FATSz32) != 0) { uint32_t fat_FirstDataSector, fat_BytesPerSector, offset; /* * If the BPB_FATSz32 field is not zero and the string "FAT" is * at the right place, this should be a FAT32 volume. */ if (strncmp(pfat32_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume detected.", pp->name); /* * If the volume label is not "NO NAME " we're done. */ if (strncmp(pfat32_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat32_bsbpb->BS_VolLab)) != 0) { strlcpy(label, pfat32_bsbpb->BS_VolLab, MIN(size, sizeof(pfat32_bsbpb->BS_VolLab) + 1)); goto endofchecks; } /* * If the volume label "NO NAME " is in the boot sector, the * label of FAT32 volumes may be stored as a special entry in * the root directory. */ fat_FirstDataSector = UINT16BYTES(pfat32_bsbpb->BPB_RsvdSecCnt) + (pfat32_bsbpb->BPB_NumFATs * UINT32BYTES(pfat32_bsbpb->BPB_FATSz32)); fat_BytesPerSector = UINT16BYTES(pfat32_bsbpb->BPB_BytsPerSec); G_LABEL_DEBUG(2, "MSDOSFS: FAT_FirstDataSector=0x%x, FAT_BytesPerSector=%d", fat_FirstDataSector, fat_BytesPerSector); for (offset = fat_BytesPerSector * fat_FirstDataSector;; offset += fat_BytesPerSector) { sector = (uint8_t *)g_read_data(cp, offset, fat_BytesPerSector, NULL); if (sector == NULL) goto error; pfat_entry = (FAT_DES *)sector; do { /* No more entries available. */ if (pfat_entry->DIR_Name[0] == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: " "FAT32 volume has no name.", pp->name); goto error; } /* Skip empty or long name entries. */ if (pfat_entry->DIR_Name[0] == 0xe5 || (pfat_entry->DIR_Attr & FAT_DES_ATTR_LONG_NAME) == FAT_DES_ATTR_LONG_NAME) { continue; } /* * The name of the entry is the volume label if * ATTR_VOLUME_ID is set. 
*/ if (pfat_entry->DIR_Attr & FAT_DES_ATTR_VOLUME_ID) { strlcpy(label, pfat_entry->DIR_Name, MIN(size, sizeof(pfat_entry->DIR_Name) + 1)); goto endofchecks; } } while((uint8_t *)(++pfat_entry) < (uint8_t *)(sector + fat_BytesPerSector)); g_free(sector); } } else { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT volume detected.", pp->name); goto error; } endofchecks: g_label_rtrim(label, size); error: if (sector0 != NULL) g_free(sector0); if (sector != NULL) g_free(sector); } struct g_label_desc g_label_msdosfs = { .ld_taste = g_label_msdosfs_taste, .ld_dir = G_LABEL_MSDOSFS_DIR, .ld_enabled = 1 }; G_LABEL_INIT(msdosfs, g_label_msdosfs, "Create device nodes for MSDOSFS volumes"); Index: head/sys/geom/label/g_label_reiserfs.c =================================================================== --- head/sys/geom/label/g_label_reiserfs.c (revision 350693) +++ head/sys/geom/label/g_label_reiserfs.c (revision 350694) @@ -1,122 +1,123 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
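For FAT32 the volume label may live in the root directory rather than the BPB, so the taste code above computes the first data sector as BPB_RsvdSecCnt + BPB_NumFATs * BPB_FATSz32 and turns it into a byte offset with BPB_BytsPerSec before scanning directory entries for one with ATTR_VOLUME_ID set. A worked example of just that offset arithmetic; the BPB values are invented and only illustrate the formula.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Illustrative FAT32 BPB fields. */
	uint32_t rsvd_sec_cnt = 32;	/* BPB_RsvdSecCnt */
	uint32_t num_fats = 2;		/* BPB_NumFATs */
	uint32_t fat_sz32 = 1024;	/* BPB_FATSz32, sectors per FAT */
	uint32_t byts_per_sec = 512;	/* BPB_BytsPerSec */

	/* Same formula as the taste routine above. */
	uint32_t first_data_sector = rsvd_sec_cnt + num_fats * fat_sz32;
	uint64_t byte_offset = (uint64_t)first_data_sector * byts_per_sec;

	printf("first data sector %u, directory scan starts at byte offset %ju\n",
	    first_data_sector, (uintmax_t)byte_offset);
	return (0);
}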
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #define REISERFS_NEW_DISK_OFFSET 64 * 1024 #define REISERFS_OLD_DISK_OFFSET 8 * 1024 #define REISERFS_SUPER_MAGIC "ReIsEr" typedef struct reiserfs_sb { uint8_t fake1[52]; char s_magic[10]; uint8_t fake2[10]; uint16_t s_version; uint8_t fake3[26]; char s_volume_name[16]; } reiserfs_sb_t; static reiserfs_sb_t * g_label_reiserfs_read_super(struct g_consumer *cp, off_t offset) { reiserfs_sb_t *fs; u_int secsize; secsize = cp->provider->sectorsize; if ((offset % secsize) != 0) return (NULL); fs = (reiserfs_sb_t *)g_read_data(cp, offset, secsize, NULL); if (fs == NULL) return (NULL); if (strncmp(fs->s_magic, REISERFS_SUPER_MAGIC, strlen(REISERFS_SUPER_MAGIC)) != 0) { g_free(fs); return (NULL); } return (fs); } static void g_label_reiserfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; reiserfs_sb_t *fs; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; /* Try old format */ fs = g_label_reiserfs_read_super(cp, REISERFS_OLD_DISK_OFFSET); if (fs == NULL) { /* Try new format */ fs = g_label_reiserfs_read_super(cp, REISERFS_NEW_DISK_OFFSET); } if (fs == NULL) return; /* Check version */ if (fs->s_version == 2) { G_LABEL_DEBUG(1, "reiserfs file system detected on %s.", pp->name); } else { goto exit_free; } /* Check for volume label */ if (fs->s_volume_name[0] == '\0') goto exit_free; /* Terminate label */ fs->s_volume_name[sizeof(fs->s_volume_name) - 1] = '\0'; strlcpy(label, fs->s_volume_name, size); exit_free: g_free(fs); } struct g_label_desc g_label_reiserfs = { .ld_taste = g_label_reiserfs_taste, .ld_dir = "reiserfs", .ld_enabled = 1 }; G_LABEL_INIT(reiserfs, g_label_reiserfs, "Create device nodes for REISERFS volumes"); Index: head/sys/geom/label/g_label_ufs.c =================================================================== --- head/sys/geom/label/g_label_ufs.c (revision 350693) +++ head/sys/geom/label/g_label_ufs.c (revision 350694) @@ -1,156 +1,157 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002, 2003 Gordon Tetlow * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #define G_LABEL_UFS_VOLUME_DIR "ufs" #define G_LABEL_UFS_ID_DIR "ufsid" #define G_LABEL_UFS_VOLUME 0 #define G_LABEL_UFS_ID 1 /* * G_LABEL_UFS_CMP returns true if difference between provider mediasize * and filesystem size is less than G_LABEL_UFS_MAXDIFF sectors */ #define G_LABEL_UFS_CMP(prov, fsys, size) \ ( abs( ((fsys)->size) - ( (prov)->mediasize / (fsys)->fs_fsize )) \ < G_LABEL_UFS_MAXDIFF ) #define G_LABEL_UFS_MAXDIFF 0x100 /* * Try to find a superblock on the provider. If successful, then * check that the size in the superblock corresponds to the size * of the underlying provider. Finally, look for a volume label * and create an appropriate provider based on that. */ static void g_label_ufs_taste_common(struct g_consumer *cp, char *label, size_t size, int what) { struct g_provider *pp; struct fs *fs; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; fs = NULL; if (SBLOCKSIZE % pp->sectorsize != 0 || ffs_sbget(cp, &fs, STDSB_NOHASHFAIL, M_GEOM, g_use_g_read_data) != 0) { KASSERT(fs == NULL, ("g_label_ufs_taste_common: non-NULL fs %p\n", fs)); return; } /* * Check for magic. We also need to check if file system size * is almost equal to providers size, because sysinstall(8) * used to bogusly put first partition at offset 0 * instead of 16, and glabel/ufs would find file system on slice * instead of partition. * * In addition, media size can be a bit bigger than file system * size. For instance, mkuzip can append bytes to align data * to large sector size (it improves compression rates). */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_fsize > 0 && ( G_LABEL_UFS_CMP(pp, fs, fs_old_size) || G_LABEL_UFS_CMP(pp, fs, fs_providersize))) { /* Valid UFS1. */ } else if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_fsize > 0 && ( G_LABEL_UFS_CMP(pp, fs, fs_size) || G_LABEL_UFS_CMP(pp, fs, fs_providersize))) { /* Valid UFS2. */ } else { goto out; } G_LABEL_DEBUG(1, "%s file system detected on %s.", fs->fs_magic == FS_UFS1_MAGIC ? 
"UFS1" : "UFS2", pp->name); switch (what) { case G_LABEL_UFS_VOLUME: /* Check for volume label */ if (fs->fs_volname[0] != '\0') strlcpy(label, fs->fs_volname, size); break; case G_LABEL_UFS_ID: if (fs->fs_id[0] != 0 || fs->fs_id[1] != 0) snprintf(label, size, "%08x%08x", fs->fs_id[0], fs->fs_id[1]); break; } out: g_free(fs->fs_csp); g_free(fs); } static void g_label_ufs_volume_taste(struct g_consumer *cp, char *label, size_t size) { g_label_ufs_taste_common(cp, label, size, G_LABEL_UFS_VOLUME); } static void g_label_ufs_id_taste(struct g_consumer *cp, char *label, size_t size) { g_label_ufs_taste_common(cp, label, size, G_LABEL_UFS_ID); } struct g_label_desc g_label_ufs_volume = { .ld_taste = g_label_ufs_volume_taste, .ld_dir = G_LABEL_UFS_VOLUME_DIR, .ld_enabled = 1 }; struct g_label_desc g_label_ufs_id = { .ld_taste = g_label_ufs_id_taste, .ld_dir = G_LABEL_UFS_ID_DIR, .ld_enabled = 1 }; G_LABEL_INIT(ufsid, g_label_ufs_id, "Create device nodes for UFS file system IDs"); G_LABEL_INIT(ufs, g_label_ufs_volume, "Create device nodes for UFS volume names"); MODULE_DEPEND(g_label, ufs, 1, 1, 1); Index: head/sys/geom/linux_lvm/g_linux_lvm.c =================================================================== --- head/sys/geom/linux_lvm/g_linux_lvm.c (revision 350693) +++ head/sys/geom/linux_lvm/g_linux_lvm.c (revision 350694) @@ -1,1193 +1,1194 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #include FEATURE(geom_linux_lvm, "GEOM Linux LVM partitioning support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GLLVM, "gllvm", "GEOM_LINUX_LVM Data"); /* GEOM class methods */ static g_access_t g_llvm_access; static g_init_t g_llvm_init; static g_orphan_t g_llvm_orphan; static g_orphan_t g_llvm_taste_orphan; static g_start_t g_llvm_start; static g_taste_t g_llvm_taste; static g_ctl_destroy_geom_t g_llvm_destroy_geom; static void g_llvm_done(struct bio *); static void g_llvm_remove_disk(struct g_llvm_vg *, struct g_consumer *); static int g_llvm_activate_lv(struct g_llvm_vg *, struct g_llvm_lv *); static int g_llvm_add_disk(struct g_llvm_vg *, struct g_provider *, char *); static void g_llvm_free_vg(struct g_llvm_vg *); static int g_llvm_destroy(struct g_llvm_vg *, int); static int g_llvm_read_label(struct g_consumer *, struct g_llvm_label *); static int g_llvm_read_md(struct g_consumer *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_label_decode(const u_char *, struct g_llvm_label *, int); static int llvm_md_decode(const u_char *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_textconf_decode(u_char *, int, struct g_llvm_metadata *); static int llvm_textconf_decode_pv(char **, char *, struct g_llvm_vg *); static int llvm_textconf_decode_lv(char **, char *, struct g_llvm_vg *); static int llvm_textconf_decode_sg(char **, char *, struct g_llvm_lv *); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, linux_lvm, CTLFLAG_RW, 0, "GEOM_LINUX_LVM stuff"); static u_int g_llvm_debug = 0; SYSCTL_UINT(_kern_geom_linux_lvm, OID_AUTO, debug, CTLFLAG_RWTUN, &g_llvm_debug, 0, "Debug level"); LIST_HEAD(, g_llvm_vg) vg_list; /* * Called to notify geom when it's been opened, and for what intent */ static int g_llvm_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_llvm_vg *vg; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); vg = gp->softc; if (vg == NULL) { /* It seems that .access can be called with negative dr,dw,dx * in this case but I want to check for myself */ G_LLVM_DEBUG(0, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) G_LLVM_DEBUG(0, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... 
drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) de--; error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all eariler components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_llvm_done(struct bio *b) { struct bio *parent_b; parent_b = b->bio_parent; if (b->bio_error != 0) { G_LLVM_DEBUG(0, "Error %d for offset=%ju, length=%ju on %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } static void g_llvm_start(struct bio *bp) { struct g_provider *pp; struct g_llvm_vg *vg; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; struct bio *cb; struct bio_queue_head bq; size_t chunk_size; off_t offset, length; char *addr; u_int count; pp = bp->bio_to; lv = pp->private; vg = pp->geom->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* XXX BIO_GETATTR allowed? */ break; default: g_io_deliver(bp, EOPNOTSUPP); return; } bioq_init(&bq); chunk_size = vg->vg_extentsize; addr = bp->bio_data; offset = bp->bio_offset; /* virtual offset and length */ length = bp->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; pv = NULL; cb = g_clone_bio(bp); if (cb == NULL) { bioq_dismantle(&bq); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* get the segment and the pv */ if (lv->lv_sgcount == 1) { /* skip much of the calculations for a single sg */ chunk_index = 0; in_chunk_offset = 0; in_chunk_length = length; sg = lv->lv_firstsg; pv = sg->sg_pv; cb->bio_offset = offset + sg->sg_pvoffset; } else { chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); /* XXX could be faster */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (chunk_index >= sg->sg_start && chunk_index <= sg->sg_end) { /* adjust chunk index for sg start */ chunk_index -= sg->sg_start; pv = sg->sg_pv; break; } } cb->bio_offset = (off_t)chunk_index * (off_t)chunk_size + in_chunk_offset + sg->sg_pvoffset; } KASSERT(pv != NULL, ("Can't find PV for chunk %zu", chunk_index)); cb->bio_to = pv->pv_gprov; cb->bio_done = g_llvm_done; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = pv; bioq_disksort(&bq, cb); G_LLVM_DEBUG(5, "Mapped %s(%ju, %ju) on %s to %zu(%zu,%zu) @ %s:%ju", bp->bio_cmd == BIO_READ ? 
"R" : "W", offset, length, lv->lv_name, chunk_index, in_chunk_offset, in_chunk_length, pv->pv_name, cb->bio_offset); addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); pv = cb->bio_caller1; cb->bio_caller1 = NULL; G_LLVM_DEBUG(6, "firing bio to %s, offset=%ju, length=%ju", cb->bio_to->name, cb->bio_offset, cb->bio_length); g_io_request(cb, pv->pv_gcons); count++; } if (count == 0) { /* We handled everything locally */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); } } static void g_llvm_remove_disk(struct g_llvm_vg *vg, struct g_consumer *cp) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int found; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); pv = (struct g_llvm_pv *)cp->private; G_LLVM_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, pv->pv_name); LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ found = 0; LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (sg->sg_pv == pv) { sg->sg_pv = NULL; lv->lv_sgactive--; found = 1; break; } } if (found) { G_LLVM_DEBUG(0, "Device %s removed.", lv->lv_gprov->name); g_wither_provider(lv->lv_gprov, ENXIO); lv->lv_gprov = NULL; } } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); } static void g_llvm_orphan(struct g_consumer *cp) { struct g_llvm_vg *vg; struct g_geom *gp; g_topology_assert(); gp = cp->geom; vg = gp->softc; if (vg == NULL) return; g_llvm_remove_disk(vg, cp); g_llvm_destroy(vg, 1); } static int g_llvm_activate_lv(struct g_llvm_vg *vg, struct g_llvm_lv *lv) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); KASSERT(lv->lv_sgactive == lv->lv_sgcount, ("segment missing")); gp = vg->vg_geom; pp = g_new_providerf(gp, "linux_lvm/%s-%s", vg->vg_name, lv->lv_name); pp->mediasize = vg->vg_extentsize * (off_t)lv->lv_extentcount; pp->sectorsize = vg->vg_sectorsize; g_error_provider(pp, 0); lv->lv_gprov = pp; pp->private = lv; G_LLVM_DEBUG(1, "Created %s, %juM", pp->name, pp->mediasize / (1024*1024)); return (0); } static int g_llvm_add_disk(struct g_llvm_vg *vg, struct g_provider *pp, char *uuid) { struct g_geom *gp; struct g_consumer *cp, *fcp; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int error; g_topology_assert(); LIST_FOREACH(pv, &vg->vg_pvs, pv_next) { if (strcmp(pv->pv_uuid, uuid) == 0) break; /* found it */ } if (pv == NULL) { G_LLVM_DEBUG(3, "uuid %s not found in pv list", uuid); return (ENOENT); } if (pv->pv_gprov != NULL) { G_LLVM_DEBUG(0, "disk %s already initialised in %s", pv->pv_name, vg->vg_name); return (EEXIST); } pv->pv_start *= vg->vg_sectorsize; gp = vg->vg_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); G_LLVM_DEBUG(1, "Attached %s to %s at offset %ju", pp->name, pv->pv_name, pv->pv_start); if (error != 0) { G_LLVM_DEBUG(0, "cannot attach %s to %s", pp->name, vg->vg_name); g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { G_LLVM_DEBUG(0, "Provider %s of %s has invalid " "sector size (%d)", pp->name, vg->vg_name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" * consumer to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); 
g_destroy_consumer(cp); return (error); } } } cp->private = pv; pv->pv_gcons = cp; pv->pv_gprov = pp; LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (strcmp(sg->sg_pvname, pv->pv_name) == 0) { /* activate the segment */ KASSERT(sg->sg_pv == NULL, ("segment already mapped")); sg->sg_pvoffset = (off_t)sg->sg_pvstart * vg->vg_extentsize + pv->pv_start; sg->sg_pv = pv; lv->lv_sgactive++; G_LLVM_DEBUG(2, "%s: %d to %d @ %s:%d" " offset %ju sector %ju", lv->lv_name, sg->sg_start, sg->sg_end, sg->sg_pvname, sg->sg_pvstart, sg->sg_pvoffset, sg->sg_pvoffset / vg->vg_sectorsize); } } /* Activate any lvs waiting on this disk */ if (lv->lv_gprov == NULL && lv->lv_sgactive == lv->lv_sgcount) { error = g_llvm_activate_lv(vg, lv); if (error) break; } } return (error); } static void g_llvm_init(struct g_class *mp) { LIST_INIT(&vg_list); } static void g_llvm_free_vg(struct g_llvm_vg *vg) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; /* Free all the structures */ while ((pv = LIST_FIRST(&vg->vg_pvs)) != NULL) { LIST_REMOVE(pv, pv_next); free(pv, M_GLLVM); } while ((lv = LIST_FIRST(&vg->vg_lvs)) != NULL) { while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } LIST_REMOVE(lv, lv_next); free(lv, M_GLLVM); } LIST_REMOVE(vg, vg_next); free(vg, M_GLLVM); } static void g_llvm_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_llvm_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_llvm_label ll; struct g_llvm_metadata md; struct g_llvm_vg *vg; int error; bzero(&md, sizeof(md)); g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); gp = g_new_geomf(mp, "linux_lvm:taste"); /* This orphan function should never be called.
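	 * The temporary taste geom attaches its consumer only long enough to
	 * read the LVM label and metadata, and detaches and destroys it again
	 * before this function returns, so no orphan event can reach it.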
*/ gp->orphan = g_llvm_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_llvm_read_label(cp, &ll); if (!error) error = g_llvm_read_md(cp, &md, &ll); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); vg = md.md_vg; if (vg->vg_geom == NULL) { /* new volume group */ gp = g_new_geomf(mp, "%s", vg->vg_name); gp->start = g_llvm_start; gp->spoiled = g_llvm_orphan; gp->orphan = g_llvm_orphan; gp->access = g_llvm_access; vg->vg_sectorsize = pp->sectorsize; vg->vg_extentsize *= vg->vg_sectorsize; vg->vg_geom = gp; gp->softc = vg; G_LLVM_DEBUG(1, "Created volume %s, extent size %zuK", vg->vg_name, vg->vg_extentsize / 1024); } /* initialise this disk in the volume group */ g_llvm_add_disk(vg, pp, ll.ll_uuid); return (vg->vg_geom); } static int g_llvm_destroy(struct g_llvm_vg *vg, int force) { struct g_provider *pp; struct g_geom *gp; g_topology_assert(); if (vg == NULL) return (ENXIO); gp = vg->vg_geom; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { G_LLVM_DEBUG(1, "Device %s is still open (r%dw%de%d)", pp->name, pp->acr, pp->acw, pp->ace); if (!force) return (EBUSY); } } g_llvm_free_vg(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_llvm_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_llvm_vg *vg; vg = gp->softc; return (g_llvm_destroy(vg, 0)); } int g_llvm_read_label(struct g_consumer *cp, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int i, error = 0; g_topology_assert(); /* The LVM label is stored on the first four sectors */ error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, 0, pp->sectorsize * 4, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(1, "Cannot read metadata from %s (error=%d)", pp->name, error); return (error); } /* Search the four sectors for the LVM label. 
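	 * llvm_label_decode() accepts a candidate sector only if it carries the
	 * LABELONE magic, the LVM2 text-format signature, and a recorded sector
	 * number that matches the sector it was actually read from.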
*/ for (i = 0; i < 4; i++) { error = llvm_label_decode(&buf[i * pp->sectorsize], ll, i); if (error == 0) break; /* found it */ } g_free(buf); return (error); } int g_llvm_read_md(struct g_consumer *cp, struct g_llvm_metadata *md, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int error; int size; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, ll->ll_md_offset, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read metadata from %s (error=%d)", cp->provider->name, error); return (error); } error = llvm_md_decode(buf, md, ll); g_free(buf); if (error != 0) { return (error); } G_LLVM_DEBUG(1, "reading LVM2 config @ %s:%ju", pp->name, ll->ll_md_offset + md->md_reloffset); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* round up to the nearest sector */ size = md->md_relsize + (pp->sectorsize - md->md_relsize % pp->sectorsize); buf = g_read_data(cp, ll->ll_md_offset + md->md_reloffset, size, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read LVM2 config from %s (error=%d)", pp->name, error); return (error); } buf[md->md_relsize] = '\0'; G_LLVM_DEBUG(10, "LVM config:\n%s\n", buf); error = llvm_textconf_decode(buf, md->md_relsize, md); g_free(buf); return (error); } static int llvm_label_decode(const u_char *data, struct g_llvm_label *ll, int sector) { uint64_t off; char *uuid; /* Magic string */ if (bcmp("LABELONE", data , 8) != 0) return (EINVAL); /* We only support LVM2 text format */ if (bcmp("LVM2 001", data + 24, 8) != 0) { G_LLVM_DEBUG(0, "Unsupported LVM format"); return (EINVAL); } ll->ll_sector = le64dec(data + 8); ll->ll_crc = le32dec(data + 16); ll->ll_offset = le32dec(data + 20); if (ll->ll_sector != sector) { G_LLVM_DEBUG(0, "Expected sector %ju, found at %d", ll->ll_sector, sector); return (EINVAL); } off = ll->ll_offset; /* * convert the binary uuid to string format, the format is * xxxxxx-xxxx-xxxx-xxxx-xxxx-xxxx-xxxxxx (6-4-4-4-4-4-6) */ uuid = ll->ll_uuid; bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '-'; for (int i = 0; i < 5; i++) { bcopy(data + off, uuid, 4); off += 4; uuid += 4; *uuid++ = '-'; } bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '\0'; ll->ll_size = le64dec(data + off); off += 8; ll->ll_pestart = le64dec(data + off); off += 16; /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one data section supported"); return (EINVAL); } off += 16; ll->ll_md_offset = le64dec(data + off); off += 8; ll->ll_md_size = le64dec(data + off); off += 8; G_LLVM_DEBUG(1, "LVM metadata: offset=%ju, size=%ju", ll->ll_md_offset, ll->ll_md_size); /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one metadata section supported"); return (EINVAL); } G_LLVM_DEBUG(2, "label uuid=%s", ll->ll_uuid); G_LLVM_DEBUG(2, "sector=%ju, crc=%u, offset=%u, size=%ju, pestart=%ju", ll->ll_sector, ll->ll_crc, ll->ll_offset, ll->ll_size, ll->ll_pestart); return (0); } static int llvm_md_decode(const u_char *data, struct g_llvm_metadata *md, struct g_llvm_label *ll) { uint64_t off; char magic[16]; off = 0; md->md_csum = le32dec(data + off); off += 4; bcopy(data + off, magic, 16); off += 16; md->md_version = le32dec(data + off); off += 4; md->md_start = le64dec(data + off); off += 8; md->md_size = le64dec(data + off); 
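	/*
	 * Layout decoded so far (little-endian, offsets from the start of the
	 * metadata area): md_csum at 0 (32 bit), the 16-byte magic at 4,
	 * md_version at 20 (32 bit), md_start at 24 and md_size at 32 (64 bit
	 * each).  The md_reloffset/md_relsize pair locating the text
	 * configuration follows at offset 40 and is decoded below.
	 */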
off += 8; if (bcmp(G_LLVM_MAGIC, magic, 16) != 0) { G_LLVM_DEBUG(0, "Incorrect md magic number"); return (EINVAL); } if (md->md_version != 1) { G_LLVM_DEBUG(0, "Incorrect md version number (%u)", md->md_version); return (EINVAL); } if (md->md_start != ll->ll_md_offset) { G_LLVM_DEBUG(0, "Incorrect md offset (%ju)", md->md_start); return (EINVAL); } /* Aparently only one is ever returned */ md->md_reloffset = le64dec(data + off); off += 8; md->md_relsize = le64dec(data + off); off += 16; /* XXX skipped checksum */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one reloc supported"); return (EINVAL); } G_LLVM_DEBUG(3, "reloc: offset=%ju, size=%ju", md->md_reloffset, md->md_relsize); G_LLVM_DEBUG(3, "md: version=%u, start=%ju, size=%ju", md->md_version, md->md_start, md->md_size); return (0); } #define GRAB_INT(key, tok1, tok2, v) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ v = strtol(tok2, &tok1, 10); \ if (tok1 == tok2) \ /* strtol did not eat any of the buffer */ \ goto bad; \ continue; \ } #define GRAB_STR(key, tok1, tok2, v, len) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ tok1 = strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ strncpy(v, tok1, len); \ continue; \ } #define SPLIT(key, value, str) \ key = strsep(&value, str); \ /* strip trailing whitespace on the key */ \ for (char *t = key; *t != '\0'; t++) \ if (isspace(*t)) { \ *t = '\0'; \ break; \ } static size_t llvm_grab_name(char *name, const char *tok) { size_t len; len = 0; if (tok == NULL) return (0); if (tok[0] == '-') return (0); if (strcmp(tok, ".") == 0 || strcmp(tok, "..") == 0) return (0); while (tok[len] && (isalpha(tok[len]) || isdigit(tok[len]) || tok[len] == '.' || tok[len] == '_' || tok[len] == '-' || tok[len] == '+') && len < G_LLVM_NAMELEN - 1) len++; bcopy(tok, name, len); name[len] = '\0'; return (len); } static int llvm_textconf_decode(u_char *data, int buflen, struct g_llvm_metadata *md) { struct g_llvm_vg *vg; char *buf = data; char *tok, *v; char name[G_LLVM_NAMELEN]; char uuid[G_LLVM_UUIDLEN]; size_t len; if (buf == NULL || *buf == '\0') return (EINVAL); tok = strsep(&buf, "\n"); if (tok == NULL) return (EINVAL); len = llvm_grab_name(name, tok); if (len == 0) return (EINVAL); /* check too see if the vg has already been loaded off another disk */ LIST_FOREACH(vg, &vg_list, vg_next) { if (strcmp(vg->vg_name, name) == 0) { uuid[0] = '\0'; /* grab the volume group uuid */ while ((tok = strsep(&buf, "\n")) != NULL) { if (strstr(tok, "{")) break; if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, uuid, sizeof(uuid)); } } if (strcmp(vg->vg_uuid, uuid) == 0) { /* existing vg */ md->md_vg = vg; return (0); } /* XXX different volume group with name clash! 
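			 * A volume group with this name is already loaded but
			 * its uuid differs, so refuse to load this copy rather
			 * than attaching the disk to an unrelated group.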
*/ G_LLVM_DEBUG(0, "%s already exists, volume group not loaded", name); return (EINVAL); } } vg = malloc(sizeof(*vg), M_GLLVM, M_NOWAIT|M_ZERO); if (vg == NULL) return (ENOMEM); strncpy(vg->vg_name, name, sizeof(vg->vg_name)); LIST_INIT(&vg->vg_pvs); LIST_INIT(&vg->vg_lvs); #define VOL_FOREACH(func, tok, buf, p) \ while ((tok = strsep(buf, "\n")) != NULL) { \ if (strstr(tok, "{")) { \ func(buf, tok, p); \ continue; \ } \ if (strstr(tok, "}")) \ break; \ } while ((tok = strsep(&buf, "\n")) != NULL) { if (strcmp(tok, "physical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_pv, tok, &buf, vg); continue; } if (strcmp(tok, "logical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_lv, tok, &buf, vg); continue; } if (strstr(tok, "{")) { G_LLVM_DEBUG(2, "unknown section %s", tok); continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, vg->vg_uuid, sizeof(vg->vg_uuid)); GRAB_INT("extent_size", v, tok, vg->vg_extentsize); continue; } } /* basic checking */ if (vg->vg_extentsize == 0) goto bad; md->md_vg = vg; LIST_INSERT_HEAD(&vg_list, vg, vg_next); G_LLVM_DEBUG(3, "vg: name=%s uuid=%s", vg->vg_name, vg->vg_uuid); return(0); bad: g_llvm_free_vg(vg); return (-1); } #undef VOL_FOREACH static int llvm_textconf_decode_pv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_pv *pv; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); pv = malloc(sizeof(*pv), M_GLLVM, M_NOWAIT|M_ZERO); if (pv == NULL) return (ENOMEM); pv->pv_vg = vg; len = 0; if (tok == NULL) goto bad; len = llvm_grab_name(pv->pv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, pv->pv_uuid, sizeof(pv->pv_uuid)); GRAB_INT("pe_start", v, tok, pv->pv_start); GRAB_INT("pe_count", v, tok, pv->pv_count); continue; } } if (tok == NULL) goto bad; /* basic checking */ if (pv->pv_count == 0) goto bad; LIST_INSERT_HEAD(&vg->vg_pvs, pv, pv_next); G_LLVM_DEBUG(3, "pv: name=%s uuid=%s", pv->pv_name, pv->pv_uuid); return (0); bad: free(pv, M_GLLVM); return (-1); } static int llvm_textconf_decode_lv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_lv *lv; struct g_llvm_segment *sg; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); lv = malloc(sizeof(*lv), M_GLLVM, M_NOWAIT|M_ZERO); if (lv == NULL) return (ENOMEM); lv->lv_vg = vg; LIST_INIT(&lv->lv_segs); if (tok == NULL) goto bad; len = llvm_grab_name(lv->lv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) { if (strstr(tok, "segment")) { llvm_textconf_decode_sg(buf, tok, lv); continue; } else /* unexpected section */ goto bad; } if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, lv->lv_uuid, sizeof(lv->lv_uuid)); GRAB_INT("segment_count", v, tok, lv->lv_sgcount); continue; } } if (tok == NULL) goto bad; if (lv->lv_sgcount == 0 || lv->lv_sgcount != lv->lv_numsegs) /* zero or incomplete segment list */ goto bad; /* Optimize for only one segment on the pv */ lv->lv_firstsg = LIST_FIRST(&lv->lv_segs); LIST_INSERT_HEAD(&vg->vg_lvs, lv, lv_next); G_LLVM_DEBUG(3, "lv: name=%s uuid=%s", lv->lv_name, lv->lv_uuid); return (0); bad: while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } free(lv, M_GLLVM); return (-1); } static int 
llvm_textconf_decode_sg(char **buf, char *tok, struct g_llvm_lv *lv) { struct g_llvm_segment *sg; char *v; int count = 0; if (*buf == NULL || **buf == '\0') return (EINVAL); sg = malloc(sizeof(*sg), M_GLLVM, M_NOWAIT|M_ZERO); if (sg == NULL) return (ENOMEM); while ((tok = strsep(buf, "\n")) != NULL) { /* only a single linear stripe is supported */ if (strstr(tok, "stripe_count")) { SPLIT(v, tok, "="); GRAB_INT("stripe_count", v, tok, count); if (count != 1) goto bad; } if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; if (strcmp(tok, "stripes = [") == 0) { tok = strsep(buf, "\n"); if (tok == NULL) goto bad; strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing open quotes */ v = strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing close quotes */ strncpy(sg->sg_pvname, v, sizeof(sg->sg_pvname)); if (*tok != ',') goto bad; /* missing comma for stripe */ tok++; sg->sg_pvstart = strtol(tok, &v, 10); if (v == tok) /* strtol did not eat any of the buffer */ goto bad; continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_INT("start_extent", v, tok, sg->sg_start); GRAB_INT("extent_count", v, tok, sg->sg_count); continue; } } if (tok == NULL) goto bad; /* basic checking */ if (count != 1 || sg->sg_count == 0) goto bad; sg->sg_end = sg->sg_start + sg->sg_count - 1; lv->lv_numsegs++; lv->lv_extentcount += sg->sg_count; LIST_INSERT_HEAD(&lv->lv_segs, sg, sg_next); return (0); bad: free(sg, M_GLLVM); return (-1); } #undef GRAB_INT #undef GRAB_STR #undef SPLIT static struct g_class g_llvm_class = { .name = G_LLVM_CLASS_NAME, .version = G_VERSION, .init = g_llvm_init, .taste = g_llvm_taste, .destroy_geom = g_llvm_destroy_geom }; DECLARE_GEOM_CLASS(g_llvm_class, g_linux_lvm); MODULE_VERSION(geom_linux_lvm, 0); Index: head/sys/geom/linux_lvm/g_linux_lvm.h =================================================================== --- head/sys/geom/linux_lvm/g_linux_lvm.h (revision 350693) +++ head/sys/geom/linux_lvm/g_linux_lvm.h (revision 350694) @@ -1,115 +1,107 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#define G_LLVM_DEBUG(lvl, ...) 
do { \ - if (g_llvm_debug >= (lvl)) { \ - printf("GEOM_LINUX_LVM"); \ - if (g_llvm_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) +#define G_LLVM_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_LINUX_LVM", g_llvm_debug, (lvl), NULL, __VA_ARGS__) #define G_LLVM_CLASS_NAME "LINUX_LVM" #define G_LLVM_NAMELEN 128 #define G_LLVM_UUIDLEN 40 #define G_LLVM_MAGIC "\040\114\126\115\062\040\170\133" \ "\065\101\045\162\060\116\052\076" struct g_llvm_label { uint64_t ll_sector; uint32_t ll_crc; uint32_t ll_offset; char ll_uuid[G_LLVM_UUIDLEN]; uint64_t ll_size; uint64_t ll_pestart; uint64_t ll_md_offset; uint64_t ll_md_size; }; struct g_llvm_metadata { uint32_t md_csum; uint32_t md_version; uint64_t md_start; uint64_t md_size; uint64_t md_reloffset; uint64_t md_relsize; struct g_llvm_vg *md_vg; }; struct g_llvm_lv { LIST_ENTRY(g_llvm_lv) lv_next; struct g_llvm_vg *lv_vg; char lv_name[G_LLVM_NAMELEN]; char lv_uuid[G_LLVM_UUIDLEN]; int lv_sgcount; int lv_sgactive; struct g_provider *lv_gprov; int lv_extentcount; LIST_HEAD(, g_llvm_segment) lv_segs; int lv_numsegs; struct g_llvm_segment *lv_firstsg; }; struct g_llvm_pv { LIST_ENTRY(g_llvm_pv) pv_next; struct g_llvm_vg *pv_vg; char pv_name[G_LLVM_NAMELEN]; char pv_uuid[G_LLVM_UUIDLEN]; size_t pv_size; off_t pv_start; int pv_count; struct g_provider *pv_gprov; struct g_consumer *pv_gcons; }; struct g_llvm_segment { LIST_ENTRY(g_llvm_segment) sg_next; int sg_start; int sg_end; int sg_count; char sg_pvname[G_LLVM_NAMELEN]; struct g_llvm_pv *sg_pv; int sg_pvstart; off_t sg_pvoffset; }; struct g_llvm_vg { LIST_ENTRY(g_llvm_vg) vg_next; char vg_name[G_LLVM_NAMELEN]; char vg_uuid[G_LLVM_UUIDLEN]; size_t vg_extentsize; int vg_sectorsize; struct g_geom *vg_geom; LIST_HEAD(, g_llvm_pv) vg_pvs; LIST_HEAD(, g_llvm_lv) vg_lvs; }; Index: head/sys/geom/mirror/g_mirror.c =================================================================== --- head/sys/geom/mirror/g_mirror.c (revision 350693) +++ head/sys/geom/mirror/g_mirror.c (revision 350694) @@ -1,3572 +1,3573 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); int g_mirror_debug = 0; SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); bool g_launch_mirror_before_timeout = true; SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout, CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0, "If false, force gmirror to wait out the full kern.geom.mirror.timeout " "before launching mirrors"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_mirror_sync_period = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_update_period, CTLFLAG_RWTUN, &g_mirror_sync_period, 0, "Metadata update period during synchronization, in seconds"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static g_ctl_destroy_geom_t g_mirror_destroy_geom; static g_taste_t g_mirror_taste; static g_init_t g_mirror_init; static g_fini_t g_mirror_fini; static g_provgone_t g_mirror_providergone; static g_resize_t g_mirror_resize; struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .providergone = g_mirror_providergone, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static int g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, const struct g_mirror_metadata *md); static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case 
G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_first(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. 
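 * The caller must hold sc_lock; NULL is returned if no connected disk has
 * the given ID.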
*/ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
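 * On failure NULL is returned and, when errorp is not NULL, the error code
 * is passed back through it.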
*/ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_sync.ds_update_ts = time_uptime; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; disk->d_init_ndisks = md->md_all; disk->d_init_slice = md->md_slice; disk->d_init_balance = md->md_balance; disk->d_init_mediasize = md->md_mediasize; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); g_topology_lock(); LIST_REMOVE(disk, d_next); g_topology_unlock(); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_free_device(struct g_mirror_softc *sc) { g_topology_assert(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_destroy(&sc->sc_lock); free(sc, M_MIRROR); } static void g_mirror_providergone(struct g_provider *pp) { struct g_mirror_softc *sc = pp->private; if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_first(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); 
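	/*
	 * Wither the main geom and drop a reference on the softc.  If the
	 * provider still holds the last reference, the softc is freed later
	 * from g_mirror_providergone() rather than here.
	 */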
g_wither_geom(gp, ENXIO); sx_xunlock(&sc->sc_lock); if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); g_topology_unlock(); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. 
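		 * The metadata sector sits at (mediasize - sectorsize) on the
		 * consumer; if the parent shrank so that this offset now falls
		 * below md_mediasize, there is no room for the metadata and
		 * ENOSPC is returned instead of encoding the sector.
		 * Illustrative numbers only: with 512-byte sectors and a 10 GB
		 * consumer the offset is 10737418240 - 512 = 10737417728, so
		 * any md_mediasize larger than that cannot be accommodated.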
*/ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error); if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return (0); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid 
bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request_error(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct bio *bp) { if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == EOPNOTSUPP) return; if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { if (bp->bio_error == ENXIO && bp->bio_cmd == BIO_READ) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; else if (bp->bio_error == ENXIO) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW; else sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } static void g_mirror_regular_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); KASSERT(sc->sc_provider == 
bp->bio_parent->bio_to, ("regular request %p with unexpected origin", bp)); pbp = bp->bio_parent; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } switch (bp->bio_cmd) { case BIO_READ: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read, bp->bio_error); break; case BIO_WRITE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write, bp->bio_error); break; case BIO_DELETE: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_delete, bp->bio_error); break; case BIO_FLUSH: KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_flush, bp->bio_error); break; } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) g_mirror_regular_request_error(sc, disk, bp); switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, pbp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { TAILQ_REMOVE(&sc->sc_inflight, pbp, bio_queue); /* Release delayed sync requests if possible. 
*/ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int val; sc = bp->bio_to->private; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = disk != NULL; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the highest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->private; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); if (bp->bio_to->error != 0) { mtx_unlock(&sc->sc_queue_mtx); g_io_deliver(bp, bp->bio_to->error); return; } TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. 
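 *
 * The check below treats every request as the half-open byte range
 * [bio_offset, bio_offset + bio_length) and reports a collision when
 * rend > sstart && rstart < send.  For example (offsets purely
 * illustrative), a regular write covering [64K, 128K) collides with a
 * synchronization request covering [96K, 224K), but not with one
 * covering [128K, 256K).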
*/ static bool g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (false); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (true); } } return (false); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static bool g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (false); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (true); } return (false); } /* * Puts regular request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); TAILQ_INSERT_TAIL(&sc->sc_regular_delayed, bp, bio_queue); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); TAILQ_INSERT_TAIL(&sc->sc_sync_delayed, bp, bio_queue); } /* * Requeue delayed regular requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp; if ((bp = TAILQ_FIRST(&sc->sc_regular_delayed)) == NULL) return; if (g_mirror_sync_collision(sc, bp)) return; G_MIRROR_DEBUG(2, "Requeuing regular requests after collision."); mtx_lock(&sc->sc_queue_mtx); TAILQ_CONCAT(&sc->sc_regular_delayed, &sc->sc_queue, bio_queue); TAILQ_SWAP(&sc->sc_regular_delayed, &sc->sc_queue, bio, bio_queue); mtx_unlock(&sc->sc_queue_mtx); } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; TAILQ_REMOVE(&sc->sc_sync_delayed, bp, bio_queue); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Free a synchronization request and clear its slot in the array. */ static void g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp) { int idx; if (disk != NULL && disk->d_sync.ds_bios != NULL) { idx = (int)(uintptr_t)bp->bio_caller1; KASSERT(disk->d_sync.ds_bios[idx] == bp, ("unexpected sync BIO at %p:%d", disk, idx)); disk->d_sync.ds_bios[idx] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } /* * Handle synchronization requests. * Every synchronization request is a two-step process: first, a read request is * sent to the mirror provider via the sync consumer. If that request completes * successfully, it is converted to a write and sent to the disk being * synchronized. If the write also completes successfully, the synchronization * offset is advanced and a new read request is submitted. 
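 *
 * Roughly, and ignoring errors and collisions, each of the
 * g_mirror_syncreqs BIOs allocated in g_mirror_sync_start() cycles as
 * follows: g_mirror_sync_reinit() prepares a BIO_READ of the mirror
 * provider; when the read completes, g_mirror_sync_done() queues it and
 * the worker calls this function, which turns the same BIO into a
 * BIO_WRITE to the disk being synchronized; when the write completes we
 * end up here again, ds_offset is advanced and the BIO is reinitialized
 * for the next read.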
*/ static void g_mirror_sync_request(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; KASSERT((bp->bio_cmd == BIO_READ && bp->bio_from->geom == sc->sc_sync.ds_geom) || (bp->bio_cmd == BIO_WRITE && bp->bio_from->geom == sc->sc_geom), ("Sync BIO %p with unexpected origin", bp)); bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_mirror_sync_request_free(NULL, bp); sx_xlock(&sc->sc_lock); return; } sync = &disk->d_sync; /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); /* * The read error will trigger a syncid bump, so there's * no need to do that here. * * The read error handling for regular requests will * retry the read from all active mirrors before passing * the error back up, so there's no need to retry here. */ g_mirror_sync_request_free(disk, bp); g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { off_t offset; int i; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; g_mirror_sync_request_free(disk, bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ g_mirror_sync_reinit(disk, bp, sync->ds_offset); sync->ds_offset += bp->bio_length; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Requeue delayed requests if possible. 
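 *
 * Now that this synchronization BIO covers a new range, a regular write
 * that was delayed because it overlapped the range previously being
 * copied may no longer collide, so give the delayed queue another look.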
*/ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp != NULL && bp->bio_offset < offset) offset = bp->bio_offset; } if (g_mirror_sync_period > 0 && time_uptime - sync->ds_update_ts > g_mirror_sync_period) { sync->ds_offset_done = offset; g_mirror_update_metadata(disk); sync->ds_update_ts = time_uptime; } return; } default: panic("Invalid I/O request %p", bp); } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. 
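 *
 * Each disk's load is, in effect, an exponentially weighted moving
 * average of its number of outstanding requests (d_consumer->index),
 * scaled by LOAD_SCALE to keep some fractional precision in integer
 * arithmetic:
 *
 *	load' = (index * LOAD_SCALE + load * 7) / 8
 *
 * For example (values purely illustrative), a disk with 2 requests in
 * flight and a previous load of 256 moves to (2*256 + 256*7)/8 = 288.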
*/ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue queue; struct bio *cbp; struct g_consumer *cp; struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SA_XLOCKED); /* * To avoid ordering issues, if a write is deferred because of a * collision with a sync request, all I/O is deferred until that * write is initiated. */ if (bp->bio_from->geom != sc->sc_sync.ds_geom && !TAILQ_EMPTY(&sc->sc_regular_delayed)) { g_mirror_regular_delay(sc, bp); return; } switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. 
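 *
 * Once a clone has been handed to g_io_request() we can no longer fail
 * the parent request cleanly, so every clone is allocated up front and
 * the whole batch is destroyed if any single allocation fails.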
*/ TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= disk->d_sync.ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (TAILQ_EMPTY(&queue)) { KASSERT(bp->bio_cmd == BIO_DELETE, ("No consumers for regular request %p", bp)); g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ TAILQ_INSERT_TAIL(&sc->sc_inflight, bp, bio_queue); return; case BIO_FLUSH: TAILQ_INIT(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } KASSERT(!TAILQ_EMPTY(&queue), ("No consumers for regular request %p", bp)); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); TAILQ_REMOVE(&queue, cbp, bio_queue); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking 
up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_first(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, true); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, false); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Handle I/O requests. */ mtx_lock(&sc->sc_queue_mtx); bp = TAILQ_FIRST(&sc->sc_queue); if (bp != NULL) TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); if (!TAILQ_EMPTY(&sc->sc_queue)) { mtx_unlock(&sc->sc_queue_mtx); continue; } } if (g_mirror_event_first(sc) != NULL) { mtx_unlock(&sc->sc_queue_mtx); continue; } sx_xunlock(&sc->sc_lock); MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { /* * Handle completion of the first half (the read) of a * block synchronization operation. */ g_mirror_sync_request(sc, bp); } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) /* * Handle completion of a regular I/O request. */ g_mirror_regular_request(sc, bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) /* * Handle completion of the second half (the * write) of a block synchronization operation. */ g_mirror_sync_request(sc, bp); else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { /* * Initiate an I/O request. 
*/ g_mirror_register_request(sc, bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset) { void *data; int idx; data = bp->bio_data; idx = (int)(uintptr_t)bp->bio_caller1; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_data = data; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = disk->d_softc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)idx; bp->bio_offset = offset; bp->bio_length = MIN(MAXPHYS, disk->d_softc->sc_mediasize - bp->bio_offset); } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_disk_sync *sync; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sync = &disk->d_sync; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(sync->ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); sync->ds_consumer = cp; sync->ds_consumer->private = disk; sync->ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ sync->ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); sync->ds_bios[i] = bp; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_caller1 = (void *)(uintptr_t)i; g_mirror_sync_reinit(disk, bp, sync->ds_offset); sync->ds_offset += bp->bio_length; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ sync->ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
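 *
 * The collision checks keep a synchronization copy and a regular write
 * to the same range strictly ordered; otherwise a sync write carrying
 * data read before a concurrent regular write could land after it and
 * clobber the fresh data on the disk being synchronized.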
*/ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } g_mirror_regular_release(sc); free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } pp->private = sc; sc->sc_refcnt++; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); /* * Abort any pending I/O that wasn't generated by us. * Synchronization requests and requests destined for individual * mirror components can be destroyed immediately. 
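 *
 * Requests addressed to our provider by upper layers are completed with
 * ENXIO so their owners see the failure; our own clones and
 * synchronization BIOs have no outside owner, so they are simply freed
 * (sync BIOs together with their data buffers).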
*/ if (bp->bio_to == sc->sc_provider && bp->bio_from->geom != sc->sc_sync.ds_geom) { g_io_deliver(bp, ENXIO); } else { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } } mtx_unlock(&sc->sc_queue_mtx); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name); g_topology_unlock(); } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 && (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, bool force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; const char *mismatch; uintmax_t found, newest; u_int dirty, ndisks; /* Pre-flight checks */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { /* * Confirm we already detected the newest genid. */ KASSERT(sc->sc_genid >= disk->d_genid, ("%s: found newer genid %u (sc:%p had %u).", __func__, disk->d_genid, sc, sc->sc_genid)); /* Kick out any previously tasted stale components. 
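 *
 * A component whose on-disk genid is lower than the newest genid seen
 * missed at least one generation bump (typically an error-triggered
 * disconnect), so its contents cannot be trusted and it is dropped
 * rather than connected.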
*/ if (disk->d_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Stale 'genid' field on %s " "(device %s) (component=%u latest=%u), skipping.", g_mirror_get_diskname(disk), sc->sc_name, disk->d_genid, sc->sc_genid); g_mirror_destroy_disk(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; continue; } /* * Confirm we already detected the newest syncid. */ KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid, ("%s: found newer syncid %u (sc:%p had %u).", __func__, disk->d_sync.ds_syncid, sc, sc->sc_syncid)); #define DETECT_MISMATCH(field, name) \ if (mismatch == NULL && \ disk->d_init_ ## field != sc->sc_ ## field) { \ mismatch = name; \ found = (intmax_t)disk->d_init_ ## field; \ newest = (intmax_t)sc->sc_ ## field; \ } mismatch = NULL; DETECT_MISMATCH(ndisks, "md_all"); DETECT_MISMATCH(balance, "md_balance"); DETECT_MISMATCH(slice, "md_slice"); DETECT_MISMATCH(mediasize, "md_mediasize"); #undef DETECT_MISMATCH if (mismatch != NULL) { G_MIRROR_DEBUG(0, "Found a mismatching '%s' " "field on %s (device %s) (found=%ju " "newest=%ju).", mismatch, g_mirror_get_diskname(disk), sc->sc_name, found, newest); g_mirror_destroy_disk(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; continue; } } KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? If the timeout (force is true) has expired, and * any disks are present, then yes. If we're permitted to launch * before the timeout has expired and the expected number of * current-generation mirror disks have been tasted, then yes. */ ndisks = g_mirror_ndisks(sc, -1); if ((force && ndisks > 0) || (g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. 
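 *
 * "Force synchronization" here just means clearing ds_syncid: a disk
 * whose syncid lags the device syncid is treated as out of date by
 * g_mirror_determine_state() and will be resynchronized (or marked
 * STALE if autosynchronization is disabled).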
*/ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No usable disks, so destroy the device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW; g_mirror_bump_syncid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. 
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; g_topology_lock(); if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } g_topology_unlock(); G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. 
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) { G_MIRROR_DEBUG(0, "Device %s: failed to clear metadata on %s: %d.", sc->sc_name, g_mirror_get_diskname(disk), error); break; } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. 
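 *
 * mirror_metadata_decode() also verifies the MD5 hash stored with the
 * metadata in the last sector; the magic and version checks below
 * reject foreign or newer-format metadata before that result is acted
 * upon.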
*/ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x " "sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x " "sc_balance 0x%x sc_mediasize 0x%jx pp_mediasize 0x%jx " "md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%jx " "md_dflags 0x%jx md_syncid 0x%x md_genid 0x%x md_priority 0x%x " "sc_state 0x%x.", __func__, md->md_did, pp->name, sc->sc_name, md->md_all, sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance, sc->sc_balance, (uintmax_t)sc->sc_mediasize, (uintmax_t)pp->mediasize, md->md_sectorsize, sc->sc_sectorsize, (uintmax_t)md->md_mflags, (uintmax_t)md->md_dflags, md->md_syncid, md->md_genid, md->md_priority, sc->sc_state); if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } /* * If the component disk we're tasting has newer metadata than the * STARTING gmirror device, refresh the device from the component. 
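 *
 * Components may be tasted in any order, so the device may have been
 * created from a component carrying older metadata; a later component
 * with fresher metadata then becomes the authoritative source for the
 * device-wide fields while the device is still STARTING.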
*/ error = g_mirror_refresh_device(sc, pp, md); if (error != 0) return (error); disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0, ("CLOSEWAIT flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } sc->sc_provider_open += acr + acw + ace; if (pp->acw + acw == 0) g_mirror_idle(sc, 0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 && sc->sc_provider_open == 0) g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_reinit_from_metadata(struct g_mirror_softc *sc, const struct g_mirror_metadata *md) { sc->sc_genid = md->md_genid; sc->sc_syncid = md->md_syncid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_ndisks = md->md_all; sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_MASK; sc->sc_flags |= (md->md_mflags & G_MIRROR_DEVICE_FLAG_MASK); } struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
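 *
 * The "action" geom created here owns the mirror/<name> provider and
 * the consumers used for regular component I/O; a separate <name>.sync
 * geom is created below whose only purpose is to own the consumer used
 * by the synchronization code to read from our own provider, which lets
 * the worker tell sync reads apart by their originating geom.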
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_type = type; sc->sc_id = md->md_mid; g_mirror_reinit_from_metadata(sc, md); sc->sc_sectorsize = md->md_sectorsize; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sc->sc_refcnt = 1; sx_init(&sc->sc_lock, "gmirror:lock"); TAILQ_INIT(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_regular_delayed); TAILQ_INIT(&sc->sc_inflight); TAILQ_INIT(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; sc->sc_provider_open = 0; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); g_destroy_geom(sc->sc_geom); g_mirror_free_device(sc); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider_open != 0) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (%d).", sc->sc_name, sc->sc_provider_open); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", sc->sc_name); } } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { sx_xunlock(&sc->sc_lock); return (0); } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom 
*gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. 
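 *
 * (No per-provider attributes are emitted here; the per-consumer and
 * per-geom branches below carry all of the state that gmirror reports.)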
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_cat(sb, "0%"); else sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_mediasize)); sbuf_cat(sb, "\n"); if (disk->d_sync.ds_offset > 0) sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); } else { sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_MIRROR_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_MIRROR_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_cat(sb, "\n"); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; if (panicstr != NULL) return; mp = arg; g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! 
Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } /* * Refresh the mirror device's metadata when gmirror encounters a newer * generation as the individual components are being added to the mirror set. */ static int g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, const struct g_mirror_metadata *md) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_genid <= md->md_genid, ("%s: attempted to refresh from stale component %s (device %s) " "(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid, sc->sc_genid)); if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid && sc->sc_syncid >= md->md_syncid)) return (0); G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u " "new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; " "provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid, sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name); if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) { /* Probable data corruption detected */ G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state " "(device=%s genid=%u). A stale mirror device was launched.", g_mirror_device_state2str(sc->sc_state), sc->sc_name, sc->sc_genid); return (EINVAL); } /* Update softc */ g_mirror_reinit_from_metadata(sc, md); G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s " "(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid, g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid, md->md_syncid, (unsigned)md->md_all); return (0); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); MODULE_VERSION(geom_mirror, 0); Index: head/sys/geom/mirror/g_mirror.h =================================================================== --- head/sys/geom/mirror/g_mirror.h (revision 350693) +++ head/sys/geom/mirror/g_mirror.h (revision 350694) @@ -1,518 +1,500 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _G_MIRROR_H_ #define _G_MIRROR_H_ #include #include #define G_MIRROR_CLASS_NAME "MIRROR" #define G_MIRROR_MAGIC "GEOM::MIRROR" /* * Version history: * 0 - Initial version number. * 1 - Added 'prefer' balance algorithm. * 2 - Added md_genid field to metadata. * 3 - Added md_provsize field to metadata. * 4 - Added 'no failure synchronization' flag. */ #define G_MIRROR_VERSION 4 #define G_MIRROR_BALANCE_NONE 0 #define G_MIRROR_BALANCE_ROUND_ROBIN 1 #define G_MIRROR_BALANCE_LOAD 2 #define G_MIRROR_BALANCE_SPLIT 3 #define G_MIRROR_BALANCE_PREFER 4 #define G_MIRROR_BALANCE_MIN G_MIRROR_BALANCE_NONE #define G_MIRROR_BALANCE_MAX G_MIRROR_BALANCE_PREFER #define G_MIRROR_DISK_FLAG_DIRTY 0x0000000000000001ULL #define G_MIRROR_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL #define G_MIRROR_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL #define G_MIRROR_DISK_FLAG_INACTIVE 0x0000000000000008ULL #define G_MIRROR_DISK_FLAG_HARDCODED 0x0000000000000010ULL #define G_MIRROR_DISK_FLAG_BROKEN 0x0000000000000020ULL #define G_MIRROR_DISK_FLAG_CANDELETE 0x0000000000000040ULL /* Per-disk flags which are recorded in on-disk metadata. */ #define G_MIRROR_DISK_FLAG_MASK (G_MIRROR_DISK_FLAG_DIRTY | \ G_MIRROR_DISK_FLAG_SYNCHRONIZING | \ G_MIRROR_DISK_FLAG_FORCE_SYNC | \ G_MIRROR_DISK_FLAG_INACTIVE | \ G_MIRROR_DISK_FLAG_CANDELETE) #define G_MIRROR_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_MIRROR_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL /* Mirror flags which are recorded in on-disk metadata. */ #define G_MIRROR_DEVICE_FLAG_MASK (G_MIRROR_DEVICE_FLAG_NOAUTOSYNC | \ G_MIRROR_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL #define G_MIRROR_DEVICE_FLAG_DESTROY 0x0100000000000000ULL #define G_MIRROR_DEVICE_FLAG_DRAIN 0x0200000000000000ULL #define G_MIRROR_DEVICE_FLAG_CLOSEWAIT 0x0400000000000000ULL #define G_MIRROR_DEVICE_FLAG_TASTING 0x0800000000000000ULL #define G_MIRROR_DEVICE_FLAG_WIPE 0x1000000000000000ULL extern int g_mirror_debug; -#define G_MIRROR_DEBUG(lvl, ...) do { \ - if (g_mirror_debug >= (lvl)) { \ - printf("GEOM_MIRROR"); \ - if (g_mirror_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_MIRROR_LOGREQ(lvl, bp, ...) do { \ - if (g_mirror_debug >= (lvl)) { \ - printf("GEOM_MIRROR"); \ - if (g_mirror_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_MIRROR_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_MIRROR", g_mirror_debug, (lvl), NULL, __VA_ARGS__) +#define G_MIRROR_LOGREQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_MIRROR", g_mirror_debug, (lvl), (bp), __VA_ARGS__) #define G_MIRROR_BIO_FLAG_REGULAR 0x01 #define G_MIRROR_BIO_FLAG_SYNC 0x02 /* * Informations needed for synchronization. */ struct g_mirror_disk_sync { struct g_consumer *ds_consumer; /* Consumer connected to our mirror. */ off_t ds_offset; /* Offset of next request to send. */ off_t ds_offset_done; /* Offset of already synchronized region. */ time_t ds_update_ts; /* Time of last metadata update. */ u_int ds_syncid; /* Disk's synchronization ID. */ u_int ds_inflight; /* Number of in-flight sync requests. */ struct bio **ds_bios; /* BIOs for synchronization I/O. */ }; /* * Informations needed for synchronization. */ struct g_mirror_device_sync { struct g_geom *ds_geom; /* Synchronization geom. */ u_int ds_ndisks; /* Number of disks in SYNCHRONIZING state. 
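 * (ds_geom is the "<name>.sync" companion geom set up next to the main
 * geom in g_mirror_create() above.)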
*/ }; #define G_MIRROR_DISK_STATE_NONE 0 #define G_MIRROR_DISK_STATE_NEW 1 #define G_MIRROR_DISK_STATE_ACTIVE 2 #define G_MIRROR_DISK_STATE_STALE 3 #define G_MIRROR_DISK_STATE_SYNCHRONIZING 4 #define G_MIRROR_DISK_STATE_DISCONNECTED 5 #define G_MIRROR_DISK_STATE_DESTROY 6 struct g_mirror_disk { uint32_t d_id; /* Disk ID. */ struct g_consumer *d_consumer; /* Consumer. */ struct g_mirror_softc *d_softc; /* Back-pointer to softc. */ int d_state; /* Disk state. */ u_int d_priority; /* Disk priority. */ u_int load; /* Averaged queue length */ off_t d_last_offset; /* Last read offset */ uint64_t d_flags; /* Additional flags. */ u_int d_genid; /* Disk's generation ID. */ struct g_mirror_disk_sync d_sync;/* Sync information. */ LIST_ENTRY(g_mirror_disk) d_next; u_int d_init_ndisks; /* Initial number of mirror components */ uint32_t d_init_slice; /* Initial slice size */ uint8_t d_init_balance;/* Initial balance */ uint64_t d_init_mediasize;/* Initial mediasize */ }; #define d_name d_consumer->provider->name #define G_MIRROR_EVENT_DONTWAIT 0x1 #define G_MIRROR_EVENT_WAIT 0x2 #define G_MIRROR_EVENT_DEVICE 0x4 #define G_MIRROR_EVENT_DONE 0x8 struct g_mirror_event { struct g_mirror_disk *e_disk; int e_state; int e_flags; int e_error; TAILQ_ENTRY(g_mirror_event) e_next; }; #define G_MIRROR_DEVICE_STATE_STARTING 0 #define G_MIRROR_DEVICE_STATE_RUNNING 1 #define G_MIRROR_TYPE_MANUAL 0 #define G_MIRROR_TYPE_AUTOMATIC 1 /* Bump syncid on first write. */ #define G_MIRROR_BUMP_SYNCID 0x1 /* Bump genid immediately. */ #define G_MIRROR_BUMP_GENID 0x2 /* Bump syncid immediately. */ #define G_MIRROR_BUMP_SYNCID_NOW 0x4 struct g_mirror_softc { u_int sc_type; /* Device type (manual/automatic). */ u_int sc_state; /* Device state. */ uint32_t sc_slice; /* Slice size. */ uint8_t sc_balance; /* Balance algorithm. */ uint64_t sc_mediasize; /* Device size. */ uint32_t sc_sectorsize; /* Sector size. */ uint64_t sc_flags; /* Additional flags. */ struct g_geom *sc_geom; struct g_provider *sc_provider; int sc_provider_open; uint32_t sc_id; /* Mirror unique ID. */ struct sx sc_lock; struct bio_queue sc_queue; struct mtx sc_queue_mtx; struct proc *sc_worker; struct bio_queue sc_inflight; /* In-flight regular write requests. */ struct bio_queue sc_regular_delayed; /* Delayed I/O requests due to collision with sync requests. */ struct bio_queue sc_sync_delayed; /* Delayed sync requests due to collision with regular requests. */ LIST_HEAD(, g_mirror_disk) sc_disks; u_int sc_ndisks; /* Number of disks. */ struct g_mirror_disk *sc_hint; u_int sc_genid; /* Generation ID. */ u_int sc_syncid; /* Synchronization ID. */ int sc_bump_id; struct g_mirror_device_sync sc_sync; int sc_idle; /* DIRTY flags removed. 
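 * (Initialized to 1 in g_mirror_create() above; the shutdown hook calls
 * g_mirror_idle() before tearing the device down.)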
*/ time_t sc_last_write; u_int sc_writes; u_int sc_refcnt; /* Number of softc references */ TAILQ_HEAD(, g_mirror_event) sc_events; struct mtx sc_events_mtx; struct callout sc_callout; struct root_hold_token *sc_rootmount; struct mtx sc_done_mtx; }; #define sc_name sc_geom->name struct g_mirror_metadata; u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state); struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type); #define G_MIRROR_DESTROY_SOFT 0 #define G_MIRROR_DESTROY_DELAYED 1 #define G_MIRROR_DESTROY_HARD 2 int g_mirror_destroy(struct g_mirror_softc *sc, int how); int g_mirror_event_send(void *arg, int state, int flags); struct g_mirror_metadata; int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md); int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md); void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md); void g_mirror_update_metadata(struct g_mirror_disk *disk); g_ctl_req_t g_mirror_config; #endif /* _KERNEL */ struct g_mirror_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Mirror name. */ uint32_t md_mid; /* Mirror unique ID. */ uint32_t md_did; /* Disk unique ID. */ uint8_t md_all; /* Number of disks in mirror. */ uint32_t md_genid; /* Generation ID. */ uint32_t md_syncid; /* Synchronization ID. */ uint8_t md_priority; /* Disk priority. */ uint32_t md_slice; /* Slice size. */ uint8_t md_balance; /* Balance type. */ uint64_t md_mediasize; /* Size of the smallest disk in mirror. */ uint32_t md_sectorsize; /* Sector size. */ uint64_t md_sync_offset; /* Synchronized offset. */ uint64_t md_mflags; /* Additional mirror flags. */ uint64_t md_dflags; /* Additional disk flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. 
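 *
 * A sketch of the current (version 3/4) on-disk layout, as implied by
 * mirror_metadata_encode() below; byte offsets into the metadata sector,
 * with the MD5 hash computed over bytes 0-118:
 *
 *     0 md_magic[16]      16 md_version        20 md_name[16]
 *    36 md_mid            40 md_did            44 md_all
 *    45 md_genid          49 md_syncid         53 md_priority
 *    54 md_slice          58 md_balance        59 md_mediasize
 *    67 md_sectorsize     71 md_sync_offset    79 md_mflags
 *    87 md_dflags         95 md_provider[16]  111 md_provsize
 *   119 md_hash[16]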
*/ }; static __inline void mirror_metadata_encode(struct g_mirror_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, 16); le32enc(data + 36, md->md_mid); le32enc(data + 40, md->md_did); *(data + 44) = md->md_all; le32enc(data + 45, md->md_genid); le32enc(data + 49, md->md_syncid); *(data + 53) = md->md_priority; le32enc(data + 54, md->md_slice); *(data + 58) = md->md_balance; le64enc(data + 59, md->md_mediasize); le32enc(data + 67, md->md_sectorsize); le64enc(data + 71, md->md_sync_offset); le64enc(data + 79, md->md_mflags); le64enc(data + 87, md->md_dflags); bcopy(md->md_provider, data + 95, 16); le64enc(data + 111, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 119, 16); } static __inline int mirror_metadata_decode_v0v1(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_syncid = le32dec(data + 45); md->md_priority = *(data + 49); md->md_slice = le32dec(data + 50); md->md_balance = *(data + 54); md->md_mediasize = le64dec(data + 55); md->md_sectorsize = le32dec(data + 63); md->md_sync_offset = le64dec(data + 67); md->md_mflags = le64dec(data + 75); md->md_dflags = le64dec(data + 83); bcopy(data + 91, md->md_provider, 16); bcopy(data + 107, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 107); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 107, 16) != 0) return (EINVAL); /* New fields. */ md->md_genid = 0; md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v2(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); bcopy(data + 111, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 111); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 111, 16) != 0) return (EINVAL); /* New fields. 
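 * (md_provsize was introduced in metadata version 3, per the version
 * history above, so version 2 metadata simply has it zeroed here.)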
*/ md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v3v4(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); md->md_provsize = le64dec(data + 111); bcopy(data + 119, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 119, 16) != 0) return (EINVAL); return (0); } static __inline int mirror_metadata_decode(const u_char *data, struct g_mirror_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: case 1: error = mirror_metadata_decode_v0v1(data, md); break; case 2: error = mirror_metadata_decode_v2(data, md); break; case 3: case 4: error = mirror_metadata_decode_v3v4(data, md); break; default: error = EINVAL; break; } return (error); } static __inline const char * balance_name(u_int balance) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer", [G_MIRROR_BALANCE_MAX + 1] = "unknown" }; if (balance > G_MIRROR_BALANCE_MAX) balance = G_MIRROR_BALANCE_MAX + 1; return (algorithms[balance]); } static __inline int balance_id(const char *name) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer" }; int n; for (n = G_MIRROR_BALANCE_MIN; n <= G_MIRROR_BALANCE_MAX; n++) { if (strcmp(name, algorithms[n]) == 0) return (n); } return (-1); } static __inline void mirror_metadata_dump(const struct g_mirror_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" name: %s\n", md->md_name); printf(" mid: %u\n", (u_int)md->md_mid); printf(" did: %u\n", (u_int)md->md_did); printf(" all: %u\n", (u_int)md->md_all); printf(" genid: %u\n", (u_int)md->md_genid); printf(" syncid: %u\n", (u_int)md->md_syncid); printf(" priority: %u\n", (u_int)md->md_priority); printf(" slice: %u\n", (u_int)md->md_slice); printf(" balance: %s\n", balance_name((u_int)md->md_balance)); printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); printf(" mflags:"); if (md->md_mflags == 0) printf(" NONE"); else { if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) printf(" NOFAILSYNC"); if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) printf(" NOAUTOSYNC"); } printf("\n"); printf(" dflags:"); if (md->md_dflags == 0) printf(" NONE"); else { if ((md->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) != 0) printf(" DIRTY"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) printf(" SYNCHRONIZING"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_FORCE_SYNC) 
!= 0) printf(" FORCE_SYNC"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) printf(" INACTIVE"); } printf("\n"); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_MIRROR_H_ */ Index: head/sys/geom/mirror/g_mirror_ctl.c =================================================================== --- head/sys/geom/mirror/g_mirror_ctl.c (revision 350693) +++ head/sys/geom/mirror/g_mirror_ctl.c (revision 350694) @@ -1,1090 +1,1091 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2009 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #include static struct g_mirror_softc * g_mirror_find_device(struct g_class *mp, const char *name) { struct g_mirror_softc *sc; struct g_geom *gp; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) continue; if (strcmp(gp->name, name) == 0 || strcmp(sc->sc_name, name) == 0) { g_topology_unlock(); sx_xlock(&sc->sc_lock); return (sc); } } g_topology_unlock(); return (NULL); } static struct g_mirror_disk * g_mirror_find_disk(struct g_mirror_softc *sc, const char *name) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); if (strncmp(name, "/dev/", 5) == 0) name += 5; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer == NULL) continue; if (disk->d_consumer->provider == NULL) continue; if (strcmp(disk->d_consumer->provider->name, name) == 0) return (disk); } return (NULL); } static void g_mirror_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name, *balancep, *prov; intmax_t *slicep, *priority; uint32_t slice; uint8_t balance; int *autosync, *noautosync, *failsync, *nofailsync, *hardcode, *dynamic; int *nargs, do_sync = 0, dirty = 1, do_priority = 0; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1 && *nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } balancep = gctl_get_asciiparam(req, "balance"); if (balancep == NULL) { gctl_error(req, "No '%s' argument.", "balance"); return; } autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync)); if (autosync == NULL) { gctl_error(req, "No '%s' argument.", "autosync"); return; } noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); if (noautosync == NULL) { gctl_error(req, "No '%s' argument.", "noautosync"); return; } failsync = gctl_get_paraml(req, "failsync", sizeof(*failsync)); if (failsync == NULL) { gctl_error(req, "No '%s' argument.", "failsync"); return; } nofailsync = gctl_get_paraml(req, "nofailsync", sizeof(*nofailsync)); if (nofailsync == NULL) { gctl_error(req, "No '%s' argument.", "nofailsync"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } dynamic = gctl_get_paraml(req, "dynamic", sizeof(*dynamic)); if (dynamic == NULL) { gctl_error(req, "No '%s' argument.", "dynamic"); return; } priority = gctl_get_paraml(req, "priority", sizeof(*priority)); if (priority == NULL) { gctl_error(req, "No '%s' argument.", "priority"); return; } if (*priority < -1 || *priority > 255) { gctl_error(req, "Priority range is 0 to 255, %jd given", *priority); return; } /* * Since we have a priority, we also need a provider now. * Note: be WARNS safe, by always assigning prov and only throw an * error if *priority != -1. 
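 *
 * For example (assuming the stock gmirror(8) syntax), a priority change
 * arrives here as something like "gmirror configure -p 2 gm0 ada1":
 * arg0 names the mirror, arg1 the component whose priority is changed,
 * and the checks further down reject any other configure option given
 * together with -p.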
*/ prov = gctl_get_asciiparam(req, "arg1"); if (*priority > -1) { if (prov == NULL) { gctl_error(req, "Priority needs a disk name"); return; } do_priority = 1; } if (*autosync && *noautosync) { gctl_error(req, "'%s' and '%s' specified.", "autosync", "noautosync"); return; } if (*failsync && *nofailsync) { gctl_error(req, "'%s' and '%s' specified.", "failsync", "nofailsync"); return; } if (*hardcode && *dynamic) { gctl_error(req, "'%s' and '%s' specified.", "hardcode", "dynamic"); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (*balancep == '\0') balance = sc->sc_balance; else { if (balance_id(balancep) == -1) { gctl_error(req, "Invalid balance algorithm."); sx_xunlock(&sc->sc_lock); return; } balance = balance_id(balancep); } slicep = gctl_get_paraml(req, "slice", sizeof(*slicep)); if (slicep == NULL) { gctl_error(req, "No '%s' argument.", "slice"); sx_xunlock(&sc->sc_lock); return; } if (*slicep == -1) slice = sc->sc_slice; else slice = *slicep; /* Enforce usage() of -p not allowing any other options. */ if (do_priority && (*autosync || *noautosync || *failsync || *nofailsync || *hardcode || *dynamic || *slicep != -1 || *balancep != '\0')) { sx_xunlock(&sc->sc_lock); gctl_error(req, "only -p accepted when setting priority"); return; } if (sc->sc_balance == balance && sc->sc_slice == slice && !*autosync && !*noautosync && !*failsync && !*nofailsync && !*hardcode && !*dynamic && !do_priority) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Nothing has changed."); return; } if ((!do_priority && *nargs != 1) || (do_priority && *nargs != 2)) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid number of arguments."); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Not all disks connected. Try 'forget' command " "first."); return; } sc->sc_balance = balance; sc->sc_slice = slice; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) { if (*autosync) { sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; do_sync = 1; } } else { if (*noautosync) sc->sc_flags |= G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) { if (*failsync) sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_NOFAILSYNC; } else { if (*nofailsync) { sc->sc_flags |= G_MIRROR_DEVICE_FLAG_NOFAILSYNC; dirty = 0; } } LIST_FOREACH(disk, &sc->sc_disks, d_next) { /* * Handle priority first, since we only need one disk, do one * operation on it and then we're done. No need to check other * flags, as usage doesn't allow it. 
*/ if (do_priority) { if (strcmp(disk->d_name, prov) == 0) { if (disk->d_priority == *priority) gctl_error(req, "Nothing has changed."); else { disk->d_priority = *priority; g_mirror_update_metadata(disk); } break; } continue; } if (do_sync) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; } if (*hardcode) disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; else if (*dynamic) disk->d_flags &= ~G_MIRROR_DISK_FLAG_HARDCODED; if (!dirty) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); if (do_sync) { if (disk->d_state == G_MIRROR_DISK_STATE_STALE) { g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } } sx_xunlock(&sc->sc_lock); } static void g_mirror_create_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while creating %s.", __func__, cp->provider->name)); } static void g_mirror_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_mirror_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct g_mirror_softc *sc; struct sbuf *sb; const char *name; char param[16]; int *nargs; intmax_t *val; int *ival; const char *sval; int bal; unsigned attached, no, sectorsize; off_t mediasize; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_MIRROR_MAGIC, sizeof(md.md_magic)); md.md_version = G_MIRROR_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_mid = arc4random(); md.md_all = *nargs - 1; md.md_genid = 0; md.md_syncid = 1; md.md_sync_offset = 0; val = gctl_get_paraml(req, "slice", sizeof(*val)); if (val == NULL) { gctl_error(req, "No slice argument."); return; } md.md_slice = *val; sval = gctl_get_asciiparam(req, "balance"); if (sval == NULL) { gctl_error(req, "No balance argument."); return; } bal = balance_id(sval); if (bal < 0) { gctl_error(req, "Invalid balance algorithm."); return; } md.md_balance = bal; md.md_mflags = 0; md.md_dflags = 0; ival = gctl_get_paraml(req, "noautosync", sizeof(*ival)); if (ival != NULL && *ival) md.md_mflags |= G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; ival = gctl_get_paraml(req, "nofailsync", sizeof(*ival)); if (ival != NULL && *ival) md.md_mflags |= G_MIRROR_DEVICE_FLAG_NOFAILSYNC; /* These fields not used in manual mode. 
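 * ("Manual" mirrors are assembled through this control request rather
 * than tasted from on-disk metadata; g_mirror_taste() above matches only
 * G_MIRROR_TYPE_AUTOMATIC devices, so a hardcoded provider name or size
 * would never be consulted for them.)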
*/ bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = 0; g_topology_lock(); mediasize = OFF_MAX; sectorsize = 0; gp = g_new_geomf(mp, "%s", md.md_name); gp->orphan = g_mirror_create_orphan; cp = g_new_consumer(gp); for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); err: g_destroy_consumer(cp); g_destroy_geom(gp); g_topology_unlock(); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MIRROR_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); goto err; } g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) { G_MIRROR_DEBUG(1, "Can't open disk %s.", name); gctl_error(req, "Can't open disk %s.", name); err2: g_detach(cp); goto err; } if (pp->mediasize == 0 || pp->sectorsize == 0) { G_MIRROR_DEBUG(1, "Disk %s has no media.", name); gctl_error(req, "Disk %s has no media.", name); g_access(cp, -1, 0, 0); goto err2; } if (pp->mediasize < mediasize) mediasize = pp->mediasize; if (pp->sectorsize > sectorsize) sectorsize = pp->sectorsize; g_access(cp, -1, 0, 0); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); md.md_mediasize = mediasize; md.md_sectorsize = sectorsize; md.md_mediasize -= (md.md_mediasize % md.md_sectorsize); gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't create %s.", md.md_name); g_topology_unlock(); return; } sc = gp->softc; g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MIRROR_DEBUG(1, "Provider %s disappear?!", name); sbuf_printf(sb, " %s", name); continue; } md.md_did = arc4random(); md.md_priority = no - 1; if (g_mirror_add_disk(sc, pp, &md) != 0) { G_MIRROR_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if (md.md_all != attached || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_HARD); gctl_error(req, "%s", sbuf_data(sb)); } else sx_xunlock(&sc->sc_lock); sbuf_delete(sb); } static void g_mirror_ctl_rebuild(struct gctl_req *req, struct g_class *mp) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_provider *pp; const char *name; char param[16]; int error, *nargs; u_int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { 
gctl_error(req, "No such provider: %s.", name); continue; } if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1 && disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { /* * This is the last active disk. There will be nothing * to rebuild it from, so deny this request. */ gctl_error(req, "Provider %s is the last active provider in %s.", name, sc->sc_geom->name); break; } /* * Do rebuild by resetting syncid, disconnecting the disk and * connecting it again. */ disk->d_sync.ds_syncid = 0; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_update_metadata(disk); pp = disk->d_consumer->provider; g_topology_lock(); error = g_mirror_read_metadata(disk->d_consumer, &md); g_topology_unlock(); g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_WAIT); if (error != 0) { gctl_error(req, "Cannot read metadata from %s.", pp->name); continue; } error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { gctl_error(req, "Cannot reconnect component %s.", pp->name); continue; } } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_insert(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_metadata md; struct g_provider *pp; struct g_consumer *cp; intmax_t *priority; const char *name; char param[16]; u_char *sector; u_int i, n; int error, *nargs, *hardcode, *inactive; struct { struct g_provider *provider; struct g_consumer *consumer; } *disks; off_t mdsize; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } priority = gctl_get_paraml(req, "priority", sizeof(*priority)); if (priority == NULL) { gctl_error(req, "No '%s' argument.", "priority"); return; } inactive = gctl_get_paraml(req, "inactive", sizeof(*inactive)); if (inactive == NULL) { gctl_error(req, "No '%s' argument.", "inactive"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { gctl_error(req, "Not all disks connected."); sx_xunlock(&sc->sc_lock); return; } disks = g_malloc(sizeof(*disks) * (*nargs), M_WAITOK | M_ZERO); g_topology_lock(); for (i = 1, n = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } if (g_mirror_find_disk(sc, name) != NULL) { gctl_error(req, "Provider %s already inserted.", name); continue; } if (strncmp(name, "/dev/", 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Unknown provider %s.", name); continue; } cp = g_new_consumer(sc->sc_geom); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); gctl_error(req, "Cannot attach to provider %s.", name); continue; } if (g_access(cp, 0, 1, 1) != 0) { gctl_error(req, "Cannot access provider %s.", name); err: g_detach(cp); g_destroy_consumer(cp); continue; } mdsize = (sc->sc_type == G_MIRROR_TYPE_AUTOMATIC) ? 
pp->sectorsize : 0; if (sc->sc_provider->mediasize > pp->mediasize - mdsize) { gctl_error(req, "Provider %s too small.", name); err2: g_access(cp, 0, -1, -1); goto err; } if ((sc->sc_provider->sectorsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid sectorsize of provider %s.", name); goto err2; } if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) { g_access(cp, 0, -1, -1); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); sc->sc_ndisks++; g_mirror_fill_metadata(sc, NULL, &md); md.md_priority = *priority; if (*inactive) md.md_dflags |= G_MIRROR_DISK_FLAG_INACTIVE; if (g_mirror_add_disk(sc, pp, &md) != 0) { sc->sc_ndisks--; gctl_error(req, "Disk %s not inserted.", name); } g_topology_lock(); continue; } disks[n].provider = pp; disks[n].consumer = cp; n++; } if (n == 0) { g_topology_unlock(); sx_xunlock(&sc->sc_lock); g_free(disks); return; } sc->sc_ndisks += n; again: for (i = 0; i < n; i++) { if (disks[i].consumer == NULL) continue; g_mirror_fill_metadata(sc, NULL, &md); md.md_priority = *priority; if (*inactive) md.md_dflags |= G_MIRROR_DISK_FLAG_INACTIVE; pp = disks[i].provider; if (*hardcode) { strlcpy(md.md_provider, pp->name, sizeof(md.md_provider)); } else { bzero(md.md_provider, sizeof(md.md_provider)); } md.md_provsize = pp->mediasize; sector = g_malloc(pp->sectorsize, M_WAITOK); mirror_metadata_encode(&md, sector); error = g_write_data(disks[i].consumer, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); g_free(sector); if (error != 0) { gctl_error(req, "Cannot store metadata on %s.", pp->name); g_access(disks[i].consumer, 0, -1, -1); g_detach(disks[i].consumer); g_destroy_consumer(disks[i].consumer); disks[i].consumer = NULL; disks[i].provider = NULL; sc->sc_ndisks--; goto again; } } g_topology_unlock(); if (i == 0) { /* All writes failed. */ sx_xunlock(&sc->sc_lock); g_free(disks); return; } LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } /* * Release provider and wait for retaste. */ g_topology_lock(); for (i = 0; i < n; i++) { if (disks[i].consumer == NULL) continue; g_access(disks[i].consumer, 0, -1, -1); g_detach(disks[i].consumer); g_destroy_consumer(disks[i].consumer); } g_topology_unlock(); sx_xunlock(&sc->sc_lock); g_free(disks); } static void g_mirror_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i, active; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Not all disks connected. 
Try 'forget' command " "first."); return; } active = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); continue; } if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { if (active > 1) active--; else { gctl_error(req, "%s: Can't remove the last " "ACTIVE component %s.", sc->sc_geom->name, name); continue; } } g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DESTROY, G_MIRROR_EVENT_DONTWAIT); } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_resize(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; uint64_t mediasize; const char *name, *s; char *x; int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Missing device."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } s = gctl_get_asciiparam(req, "size"); if (s == NULL) { gctl_error(req, "No '%s' argument.", "size"); return; } mediasize = strtouq(s, &x, 0); if (*x != '\0' || mediasize == 0) { gctl_error(req, "Invalid '%s' argument.", "size"); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } /* Deny shrinking of an opened provider */ if ((g_debugflags & 16) == 0 && sc->sc_provider_open > 0) { if (sc->sc_mediasize > mediasize) { gctl_error(req, "Device %s is busy.", sc->sc_provider->name); sx_xunlock(&sc->sc_lock); return; } } LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (mediasize > disk->d_consumer->provider->mediasize - disk->d_consumer->provider->sectorsize) { gctl_error(req, "Provider %s is too small.", disk->d_name); sx_xunlock(&sc->sc_lock); return; } } /* Update the size. 
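 * The new size is written into every component's metadata and then
 * propagated with g_resize_provider(); the checks above refuse to shrink
 * an open provider (unless the relevant g_debugflags bit is set) or to
 * grow beyond what the smallest component can hold once its metadata
 * sector is reserved.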
*/ sc->sc_mediasize = mediasize; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } g_topology_lock(); g_resize_provider(sc->sc_provider, mediasize); g_topology_unlock(); sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_deactivate(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i, active; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } active = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); continue; } if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { if (active > 1) active--; else { gctl_error(req, "%s: Can't deactivate the " "last ACTIVE component %s.", sc->sc_geom->name, name); continue; } } disk->d_flags |= G_MIRROR_DISK_FLAG_INACTIVE; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_update_metadata(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_forget(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) == sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); G_MIRROR_DEBUG(1, "All disks connected in %s, skipping.", sc->sc_name); continue; } sc->sc_ndisks = g_mirror_ndisks(sc, -1); LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } sx_xunlock(&sc->sc_lock); } } static void g_mirror_ctl_stop(struct gctl_req *req, struct g_class *mp, int wipe) { struct g_mirror_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; int how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } if (*force) how = G_MIRROR_DESTROY_HARD; else how = G_MIRROR_DESTROY_SOFT; for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { 
gctl_error(req, "No such device: %s.", name); return; } g_cancel_event(sc); if (wipe) sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WIPE; error = g_mirror_destroy(sc, how); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_geom->name, error); if (wipe) sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_WIPE; sx_xunlock(&sc->sc_lock); return; } /* No need to unlock, because lock is already dead. */ } } void g_mirror_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_MIRROR_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } g_topology_unlock(); if (strcmp(verb, "configure") == 0) g_mirror_ctl_configure(req, mp); else if (strcmp(verb, "create") == 0) g_mirror_ctl_create(req, mp); else if (strcmp(verb, "rebuild") == 0) g_mirror_ctl_rebuild(req, mp); else if (strcmp(verb, "insert") == 0) g_mirror_ctl_insert(req, mp); else if (strcmp(verb, "remove") == 0) g_mirror_ctl_remove(req, mp); else if (strcmp(verb, "resize") == 0) g_mirror_ctl_resize(req, mp); else if (strcmp(verb, "deactivate") == 0) g_mirror_ctl_deactivate(req, mp); else if (strcmp(verb, "forget") == 0) g_mirror_ctl_forget(req, mp); else if (strcmp(verb, "stop") == 0) g_mirror_ctl_stop(req, mp, 0); else if (strcmp(verb, "destroy") == 0) g_mirror_ctl_stop(req, mp, 1); else gctl_error(req, "Unknown verb."); g_topology_lock(); } Index: head/sys/geom/mountver/g_mountver.c =================================================================== --- head/sys/geom/mountver/g_mountver.c (revision 350693) +++ head/sys/geom/mountver/g_mountver.c (revision 350694) @@ -1,663 +1,664 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Edward Tomasz Napierala * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mountver, CTLFLAG_RW, 0, "GEOM_MOUNTVER stuff"); static u_int g_mountver_debug = 0; static u_int g_mountver_check_ident = 1; SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, debug, CTLFLAG_RW, &g_mountver_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, check_ident, CTLFLAG_RW, &g_mountver_check_ident, 0, "Check disk ident when reattaching"); static eventhandler_tag g_mountver_pre_sync = NULL; static void g_mountver_queue(struct bio *bp); static void g_mountver_orphan(struct g_consumer *cp); static void g_mountver_resize(struct g_consumer *cp); static int g_mountver_destroy(struct g_geom *gp, boolean_t force); static g_taste_t g_mountver_taste; static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb); static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mountver_init(struct g_class *mp); static void g_mountver_fini(struct g_class *mp); struct g_class g_mountver_class = { .name = G_MOUNTVER_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mountver_config, .taste = g_mountver_taste, .destroy_geom = g_mountver_destroy_geom, .init = g_mountver_init, .fini = g_mountver_fini }; static void g_mountver_done(struct bio *bp) { struct g_geom *gp; struct bio *pbp; if (bp->bio_error != ENXIO) { g_std_done(bp); return; } /* * When the device goes away, it's possible that few requests * will be completed with ENXIO before g_mountver_orphan() * gets called. To work around that, we have to queue requests * that failed with ENXIO, in order to send them later. 
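 *
 * The overall sequence is roughly: the device disappears, in-flight BIOs
 * complete with ENXIO and are re-queued here, g_mountver_orphan() drops
 * access and detaches the consumer, and once a matching provider shows
 * up again the queued requests are resubmitted via
 * g_mountver_send_queued().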
*/ gp = bp->bio_from->geom; pbp = bp->bio_parent; KASSERT(pbp->bio_to == LIST_FIRST(&gp->provider), ("parent request was for someone else")); g_destroy_bio(bp); pbp->bio_inbed++; g_mountver_queue(pbp); } static void g_mountver_send(struct bio *bp) { struct g_geom *gp; struct bio *cbp; gp = bp->bio_to->geom; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_mountver_done; g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_mountver_queue(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; mtx_lock(&sc->sc_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_mtx); } static void g_mountver_send_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); G_MOUNTVER_LOGREQ(bp, "Sending queued request."); g_mountver_send(bp); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_discard_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); G_MOUNTVER_LOGREQ(bp, "Discarding queued request."); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_start(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; G_MOUNTVER_LOGREQ(bp, "Request received."); /* * It is possible that some bios were returned with ENXIO, even though * orphaning didn't happen yet. In that case, queue all subsequent * requests in order to maintain ordering. */ if (sc->sc_orphaned || !TAILQ_EMPTY(&sc->sc_queue)) { if (sc->sc_shutting_down) { G_MOUNTVER_LOGREQ(bp, "Discarding request due to shutdown."); g_io_deliver(bp, ENXIO); return; } G_MOUNTVER_LOGREQ(bp, "Queueing request."); g_mountver_queue(bp); if (!sc->sc_orphaned) g_mountver_send_queued(gp); } else { G_MOUNTVER_LOGREQ(bp, "Sending request."); g_mountver_send(bp); } } static int g_mountver_access(struct g_provider *pp, int dr, int dw, int de) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = pp->geom; cp = LIST_FIRST(&gp->consumer); sc = gp->softc; if (sc == NULL && dr <= 0 && dw <= 0 && de <= 0) return (0); KASSERT(sc != NULL, ("Trying to access withered provider \"%s\".", pp->name)); sc->sc_access_r += dr; sc->sc_access_w += dw; sc->sc_access_e += de; if (sc->sc_orphaned) return (0); return (g_access(cp, dr, dw, de)); } static int g_mountver_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; char name[64]; int error; int identsize = DISK_IDENT_SIZE; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; snprintf(name, sizeof(name), "%s%s", pp->name, G_MOUNTVER_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "gmountver", NULL, MTX_DEF | MTX_RECURSE); TAILQ_INIT(&sc->sc_queue); sc->sc_provider_name = strdup(pp->name, M_GEOM); gp->softc = sc; gp->start = g_mountver_start; gp->orphan = g_mountver_orphan; gp->resize = g_mountver_resize; gp->access = g_mountver_access; gp->dumpconf = g_mountver_dumpconf; 
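/*
 * Create the pass-through provider.  It takes over the media and sector
 * size of the underlying provider and, below, advertises unmapped I/O
 * only when the underlying provider accepts it.
 */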
newpp = g_new_providerf(gp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0) { G_MOUNTVER_DEBUG(0, "Unmapped supported for %s.", gp->name); newpp->flags |= G_PF_ACCEPT_UNMAPPED; } else { G_MOUNTVER_DEBUG(0, "Unmapped unsupported for %s.", gp->name); newpp->flags &= ~G_PF_ACCEPT_UNMAPPED; } cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } error = g_access(cp, 1, 0, 0); if (error != 0) { gctl_error(req, "Cannot access provider %s.", pp->name); goto fail; } error = g_io_getattr("GEOM::ident", cp, &identsize, sc->sc_ident); g_access(cp, -1, 0, 0); if (error != 0) { if (g_mountver_check_ident) { gctl_error(req, "Cannot get disk ident from %s; error = %d.", pp->name, error); goto fail; } G_MOUNTVER_DEBUG(0, "Cannot get disk ident from %s; error = %d.", pp->name, error); sc->sc_ident[0] = '\0'; } g_error_provider(newpp, 0); G_MOUNTVER_DEBUG(0, "Device %s created.", gp->name); return (0); fail: g_free(sc->sc_provider_name); if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); g_free(gp->softc); g_destroy_geom(gp); return (error); } static int g_mountver_destroy(struct g_geom *gp, boolean_t force) { struct g_mountver_softc *sc; struct g_provider *pp; g_topology_assert(); if (gp->softc == NULL) return (ENXIO); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_MOUNTVER_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_MOUNTVER_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_MOUNTVER_DEBUG(0, "Device %s removed.", gp->name); } if (pp != NULL) g_wither_provider(pp, ENXIO); g_mountver_discard_queued(gp); g_free(sc->sc_provider_name); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_mountver_destroy(gp, 0)); } static void g_mountver_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MOUNTVER_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } if (g_mountver_create(req, mp, pp) != 0) return; } } static struct g_geom * g_mountver_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_mountver_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", 
sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); gp = g_mountver_find_geom(mp, name); if (gp == NULL) { G_MOUNTVER_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_mountver_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_mountver_orphan(struct g_consumer *cp) { struct g_mountver_softc *sc; g_topology_assert(); sc = cp->geom->softc; sc->sc_orphaned = 1; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); G_MOUNTVER_DEBUG(0, "%s is offline. Mount verification in progress.", sc->sc_provider_name); } static void g_mountver_resize(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, cp->provider->mediasize); } static int g_mountver_ident_matches(struct g_geom *gp) { struct g_consumer *cp; struct g_mountver_softc *sc; char ident[DISK_IDENT_SIZE]; int error, identsize = DISK_IDENT_SIZE; sc = gp->softc; cp = LIST_FIRST(&gp->consumer); if (g_mountver_check_ident == 0) return (0); error = g_access(cp, 1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; " "not attaching; error = %d.", gp->name, error); return (1); } error = g_io_getattr("GEOM::ident", cp, &identsize, ident); g_access(cp, -1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot get disk ident for %s; " "not attaching; error = %d.", gp->name, error); return (1); } if (strcmp(ident, sc->sc_ident) != 0) { G_MOUNTVER_DEBUG(1, "Disk ident for %s (\"%s\") is different " "from expected \"%s\", not attaching.", gp->name, ident, sc->sc_ident); return (1); } return (0); } static struct g_geom * g_mountver_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mountver_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MOUNTVER_DEBUG(2, "Tasting %s.", pp->name); /* * Let's check if device already exists. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; /* Already attached? 
*/ if (pp == LIST_FIRST(&gp->provider)) return (NULL); if (sc->sc_orphaned && strcmp(pp->name, sc->sc_provider_name) == 0) break; } if (gp == NULL) return (NULL); cp = LIST_FIRST(&gp->consumer); g_attach(cp, pp); error = g_mountver_ident_matches(gp); if (error != 0) { g_detach(cp); return (NULL); } if (sc->sc_access_r > 0 || sc->sc_access_w > 0 || sc->sc_access_e > 0) { error = g_access(cp, sc->sc_access_r, sc->sc_access_w, sc->sc_access_e); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; error = %d.", pp->name, error); g_detach(cp); return (NULL); } } g_mountver_send_queued(gp); sc->sc_orphaned = 0; G_MOUNTVER_DEBUG(0, "%s has completed mount verification.", sc->sc_provider_name); return (gp); } static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_MOUNTVER_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_mountver_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_mountver_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mountver_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%s\n", indent, sc->sc_orphaned ? "OFFLINE" : "ONLINE"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_provider_name); sbuf_printf(sb, "%s%s\n", indent, sc->sc_ident); } static void g_mountver_shutdown_pre_sync(void *arg, int howto) { struct g_mountver_softc *sc; struct g_class *mp; struct g_geom *gp, *gp2; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; sc = gp->softc; sc->sc_shutting_down = 1; if (sc->sc_orphaned) g_mountver_destroy(gp, 1); } g_topology_unlock(); } static void g_mountver_init(struct g_class *mp) { g_mountver_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_mountver_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mountver_pre_sync == NULL) G_MOUNTVER_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mountver_fini(struct g_class *mp) { if (g_mountver_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_mountver_pre_sync); } DECLARE_GEOM_CLASS(g_mountver_class, g_mountver); MODULE_VERSION(geom_mountver, 0); Index: head/sys/geom/mountver/g_mountver.h =================================================================== --- head/sys/geom/mountver/g_mountver.h (revision 350693) +++ head/sys/geom/mountver/g_mountver.h (revision 350694) @@ -1,74 +1,59 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Edward Tomasz Napierala * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_MOUNTVER_H_ #define _G_MOUNTVER_H_ #define G_MOUNTVER_CLASS_NAME "MOUNTVER" #define G_MOUNTVER_VERSION 4 #define G_MOUNTVER_SUFFIX ".mountver" #ifdef _KERNEL -#define G_MOUNTVER_DEBUG(lvl, ...) do { \ - if (g_mountver_debug >= (lvl)) { \ - printf("GEOM_MOUNTVER"); \ - if (g_mountver_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_MOUNTVER_LOGREQ(bp, ...) do { \ - if (g_mountver_debug >= 2) { \ - printf("GEOM_MOUNTVER[2]: "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_MOUNTVER_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_MOUNTVER", g_mountver_debug, (lvl), NULL, __VA_ARGS__) +#define G_MOUNTVER_LOGREQ(bp, ...) \ + _GEOM_DEBUG("GEOM_MOUNTVER", g_mountver_debug, 2, (bp), __VA_ARGS__) struct g_mountver_softc { TAILQ_HEAD(, bio) sc_queue; struct mtx sc_mtx; char *sc_provider_name; char sc_ident[DISK_IDENT_SIZE]; int sc_orphaned; int sc_shutting_down; int sc_access_r; int sc_access_w; int sc_access_e; }; #endif /* _KERNEL */ #endif /* _G_MOUNTVER_H_ */ Index: head/sys/geom/nop/g_nop.c =================================================================== --- head/sys/geom/nop/g_nop.c (revision 350693) +++ head/sys/geom/nop/g_nop.c (revision 350694) @@ -1,922 +1,923 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, nop, CTLFLAG_RW, 0, "GEOM_NOP stuff"); static u_int g_nop_debug = 0; SYSCTL_UINT(_kern_geom_nop, OID_AUTO, debug, CTLFLAG_RW, &g_nop_debug, 0, "Debug level"); static int g_nop_destroy(struct g_geom *gp, boolean_t force); static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb); static g_access_t g_nop_access; static g_dumpconf_t g_nop_dumpconf; static g_orphan_t g_nop_orphan; static g_provgone_t g_nop_providergone; static g_resize_t g_nop_resize; static g_start_t g_nop_start; struct g_class g_nop_class = { .name = G_NOP_CLASS_NAME, .version = G_VERSION, .ctlreq = g_nop_config, .destroy_geom = g_nop_destroy_geom, .access = g_nop_access, .dumpconf = g_nop_dumpconf, .orphan = g_nop_orphan, .providergone = g_nop_providergone, .resize = g_nop_resize, .start = g_nop_start, }; struct g_nop_delay { struct callout dl_cal; struct bio *dl_bio; TAILQ_ENTRY(g_nop_delay) dl_next; }; static void g_nop_orphan(struct g_consumer *cp) { g_topology_assert(); g_nop_destroy(cp->geom, 1); } static void g_nop_resize(struct g_consumer *cp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; off_t size; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc->sc_explicitsize != 0) return; if (cp->provider->mediasize < sc->sc_offset) { g_nop_destroy(gp, 1); return; } size = cp->provider->mediasize - sc->sc_offset; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, size); } static int g_nop_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset, size_t length) { return (0); } static void g_nop_kerneldump(struct bio *bp, struct g_nop_softc *sc) { struct g_kerneldump *gkd; struct g_geom *gp; struct g_provider *pp; gkd = (struct g_kerneldump *)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); pp = LIST_FIRST(&gp->provider); gkd->di.dumper = g_nop_dumper; gkd->di.priv = sc; gkd->di.blocksize = pp->sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = sc->sc_offset + gkd->offset; if (gkd->offset > sc->sc_explicitsize) { g_io_deliver(bp, ENODEV); return; } if (gkd->offset + gkd->length > sc->sc_explicitsize) gkd->length = sc->sc_explicitsize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_nop_pass(struct bio *cbp, struct g_geom *gp) { G_NOP_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_nop_pass_timeout(void *data) { struct g_nop_softc *sc; struct g_geom *gp; struct g_nop_delay *gndelay; gndelay = (struct g_nop_delay *)data; gp = gndelay->dl_bio->bio_to->geom; sc = gp->softc; mtx_lock(&sc->sc_lock); TAILQ_REMOVE(&sc->sc_head_delay, gndelay, dl_next); mtx_unlock(&sc->sc_lock); g_nop_pass(gndelay->dl_bio, gp); g_free(data); } static void g_nop_start(struct bio *bp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *cbp; u_int failprob, delayprob, delaytime; failprob = delayprob = 0; gp = bp->bio_to->geom; sc = gp->softc; G_NOP_LOGREQ(bp, "Request received."); mtx_lock(&sc->sc_lock); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; failprob = sc->sc_rfailprob; delayprob = sc->sc_rdelayprob; 
delaytime = sc->sc_delaymsec; break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; failprob = sc->sc_wfailprob; delayprob = sc->sc_wdelayprob; delaytime = sc->sc_delaymsec; break; case BIO_DELETE: sc->sc_deletes++; break; case BIO_GETATTR: sc->sc_getattrs++; if (sc->sc_physpath && g_handleattr_str(bp, "GEOM::physpath", sc->sc_physpath)) ; else if (strcmp(bp->bio_attribute, "GEOM::kerneldump") == 0) g_nop_kerneldump(bp, sc); else /* * Fallthrough to forwarding the GETATTR down to the * lower level device. */ break; mtx_unlock(&sc->sc_lock); return; case BIO_FLUSH: sc->sc_flushes++; break; case BIO_CMD0: sc->sc_cmd0s++; break; case BIO_CMD1: sc->sc_cmd1s++; break; case BIO_CMD2: sc->sc_cmd2s++; break; } mtx_unlock(&sc->sc_lock); if (failprob > 0) { u_int rval; rval = arc4random() % 100; if (rval < failprob) { G_NOP_LOGREQLVL(1, bp, "Returning error=%d.", sc->sc_error); g_io_deliver(bp, sc->sc_error); return; } } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; cbp->bio_offset = bp->bio_offset + sc->sc_offset; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL pp")); cbp->bio_to = pp; if (delayprob > 0) { struct g_nop_delay *gndelay; u_int rval; rval = arc4random() % 100; if (rval < delayprob) { gndelay = g_malloc(sizeof(*gndelay), M_NOWAIT | M_ZERO); if (gndelay != NULL) { callout_init(&gndelay->dl_cal, 1); gndelay->dl_bio = cbp; mtx_lock(&sc->sc_lock); TAILQ_INSERT_TAIL(&sc->sc_head_delay, gndelay, dl_next); mtx_unlock(&sc->sc_lock); callout_reset(&gndelay->dl_cal, MSEC_2_TICKS(delaytime), g_nop_pass_timeout, gndelay); return; } } } g_nop_pass(cbp, gp); } static int g_nop_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static int g_nop_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, int ioerror, u_int rfailprob, u_int wfailprob, u_int delaymsec, u_int rdelayprob, u_int wdelayprob, off_t offset, off_t size, u_int secsize, off_t stripesize, off_t stripeoffset, const char *physpath) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; char name[64]; int error; off_t explicitsize; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; if ((offset % pp->sectorsize) != 0) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } if ((size % pp->sectorsize) != 0) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (offset >= pp->mediasize) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } explicitsize = size; if (size == 0) size = pp->mediasize - offset; if (offset + size > pp->mediasize) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (secsize == 0) secsize = pp->sectorsize; else if ((secsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid secsize for provider %s.", pp->name); return (EINVAL); } if (secsize > MAXPHYS) { gctl_error(req, "secsize is too big."); return (EINVAL); } size -= size % secsize; if ((stripesize % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripesize for provider %s.", pp->name); return (EINVAL); } if ((stripeoffset % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripeoffset for provider %s.", pp->name); return (EINVAL); } if (stripesize != 0 && stripeoffset >= stripesize) { gctl_error(req, "stripeoffset is too big."); return 
(EINVAL); } snprintf(name, sizeof(name), "%s%s", pp->name, G_NOP_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_offset = offset; sc->sc_explicitsize = explicitsize; sc->sc_stripesize = stripesize; sc->sc_stripeoffset = stripeoffset; if (physpath && strcmp(physpath, G_NOP_PHYSPATH_PASSTHROUGH)) { sc->sc_physpath = strndup(physpath, MAXPATHLEN, M_GEOM); } else sc->sc_physpath = NULL; sc->sc_error = ioerror; sc->sc_rfailprob = rfailprob; sc->sc_wfailprob = wfailprob; sc->sc_delaymsec = delaymsec; sc->sc_rdelayprob = rdelayprob; sc->sc_wdelayprob = wdelayprob; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; TAILQ_INIT(&sc->sc_head_delay); mtx_init(&sc->sc_lock, "gnop lock", NULL, MTX_DEF); gp->softc = sc; newpp = g_new_providerf(gp, "%s", gp->name); newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; newpp->mediasize = size; newpp->sectorsize = secsize; newpp->stripesize = stripesize; newpp->stripeoffset = stripeoffset; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } newpp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(newpp, 0); G_NOP_DEBUG(0, "Device %s created.", gp->name); return (0); fail: if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_lock); free(sc->sc_physpath, M_GEOM); g_free(gp->softc); g_destroy_geom(gp); return (error); } static void g_nop_providergone(struct g_provider *pp) { struct g_geom *gp = pp->geom; struct g_nop_softc *sc = gp->softc; KASSERT(TAILQ_EMPTY(&sc->sc_head_delay), ("delayed request list is not empty")); gp->softc = NULL; free(sc->sc_physpath, M_GEOM); mtx_destroy(&sc->sc_lock); g_free(sc); } static int g_nop_destroy(struct g_geom *gp, boolean_t force) { struct g_nop_softc *sc; struct g_provider *pp; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_NOP_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_NOP_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_NOP_DEBUG(0, "Device %s removed.", gp->name); } g_wither_geom(gp, ENXIO); return (0); } static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_nop_destroy(gp, 0)); } static void g_nop_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; intmax_t *error, *rfailprob, *wfailprob, *offset, *secsize, *size, *stripesize, *stripeoffset, *delaymsec, *rdelayprob, *wdelayprob; const char *name, *physpath; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } error = gctl_get_paraml(req, "error", sizeof(*error)); if (error == NULL) { gctl_error(req, "No '%s' argument", "error"); return; } rfailprob = gctl_get_paraml(req, "rfailprob", sizeof(*rfailprob)); 
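/*
 * g_nop_start() above turns rfailprob/wfailprob/rdelayprob/wdelayprob into
 * behaviour by drawing arc4random() % 100 and comparing it against the
 * configured percentage.  The userland sketch below shows the same dice
 * roll, assuming arc4random(3) from <stdlib.h> as on FreeBSD; the function
 * and parameter names are illustrative only, and usleep() stands in for the
 * kernel callout used for delayed requests.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Fail "failprob" percent of calls with "error"; delay "delayprob" percent. */
static int
maybe_fail(unsigned int failprob, unsigned int delayprob,
    unsigned int delaymsec, int error)
{
	if (failprob > 0 && arc4random() % 100 < failprob)
		return (error);
	if (delayprob > 0 && arc4random() % 100 < delayprob)
		usleep(delaymsec * 1000);
	return (0);
}

int
main(void)
{
	int i, failures = 0;

	for (i = 0; i < 1000; i++) {
		/* 10% chance of EIO, 5% chance of a 1 ms delay. */
		if (maybe_fail(10, 5, 1, EIO) != 0)
			failures++;
	}
	printf("%d of 1000 simulated I/Os failed\n", failures);
	return (0);
}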
if (rfailprob == NULL) { gctl_error(req, "No '%s' argument", "rfailprob"); return; } if (*rfailprob < -1 || *rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } wfailprob = gctl_get_paraml(req, "wfailprob", sizeof(*wfailprob)); if (wfailprob == NULL) { gctl_error(req, "No '%s' argument", "wfailprob"); return; } if (*wfailprob < -1 || *wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } delaymsec = gctl_get_paraml(req, "delaymsec", sizeof(*delaymsec)); if (delaymsec == NULL) { gctl_error(req, "No '%s' argument", "delaymsec"); return; } if (*delaymsec < 1 && *delaymsec != -1) { gctl_error(req, "Invalid '%s' argument", "delaymsec"); return; } rdelayprob = gctl_get_paraml(req, "rdelayprob", sizeof(*rdelayprob)); if (rdelayprob == NULL) { gctl_error(req, "No '%s' argument", "rdelayprob"); return; } if (*rdelayprob < -1 || *rdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "rdelayprob"); return; } wdelayprob = gctl_get_paraml(req, "wdelayprob", sizeof(*wdelayprob)); if (wdelayprob == NULL) { gctl_error(req, "No '%s' argument", "wdelayprob"); return; } if (*wdelayprob < -1 || *wdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "wdelayprob"); return; } offset = gctl_get_paraml(req, "offset", sizeof(*offset)); if (offset == NULL) { gctl_error(req, "No '%s' argument", "offset"); return; } if (*offset < 0) { gctl_error(req, "Invalid '%s' argument", "offset"); return; } size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if (*size < 0) { gctl_error(req, "Invalid '%s' argument", "size"); return; } secsize = gctl_get_paraml(req, "secsize", sizeof(*secsize)); if (secsize == NULL) { gctl_error(req, "No '%s' argument", "secsize"); return; } if (*secsize < 0) { gctl_error(req, "Invalid '%s' argument", "secsize"); return; } stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "No '%s' argument", "stripesize"); return; } if (*stripesize < 0) { gctl_error(req, "Invalid '%s' argument", "stripesize"); return; } stripeoffset = gctl_get_paraml(req, "stripeoffset", sizeof(*stripeoffset)); if (stripeoffset == NULL) { gctl_error(req, "No '%s' argument", "stripeoffset"); return; } if (*stripeoffset < 0) { gctl_error(req, "Invalid '%s' argument", "stripeoffset"); return; } physpath = gctl_get_asciiparam(req, "physpath"); for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } if (g_nop_create(req, mp, pp, *error == -1 ? EIO : (int)*error, *rfailprob == -1 ? 0 : (u_int)*rfailprob, *wfailprob == -1 ? 0 : (u_int)*wfailprob, *delaymsec == -1 ? 1 : (u_int)*delaymsec, *rdelayprob == -1 ? 0 : (u_int)*rdelayprob, *wdelayprob == -1 ? 
0 : (u_int)*wdelayprob, (off_t)*offset, (off_t)*size, (u_int)*secsize, (off_t)*stripesize, (off_t)*stripeoffset, physpath) != 0) { return; } } } static void g_nop_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; intmax_t *delaymsec, *error, *rdelayprob, *rfailprob, *wdelayprob, *wfailprob; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } error = gctl_get_paraml(req, "error", sizeof(*error)); if (error == NULL) { gctl_error(req, "No '%s' argument", "error"); return; } rfailprob = gctl_get_paraml(req, "rfailprob", sizeof(*rfailprob)); if (rfailprob == NULL) { gctl_error(req, "No '%s' argument", "rfailprob"); return; } if (*rfailprob < -1 || *rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } wfailprob = gctl_get_paraml(req, "wfailprob", sizeof(*wfailprob)); if (wfailprob == NULL) { gctl_error(req, "No '%s' argument", "wfailprob"); return; } if (*wfailprob < -1 || *wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } delaymsec = gctl_get_paraml(req, "delaymsec", sizeof(*delaymsec)); if (delaymsec == NULL) { gctl_error(req, "No '%s' argument", "delaymsec"); return; } if (*delaymsec < 1 && *delaymsec != -1) { gctl_error(req, "Invalid '%s' argument", "delaymsec"); return; } rdelayprob = gctl_get_paraml(req, "rdelayprob", sizeof(*rdelayprob)); if (rdelayprob == NULL) { gctl_error(req, "No '%s' argument", "rdelayprob"); return; } if (*rdelayprob < -1 || *rdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "rdelayprob"); return; } wdelayprob = gctl_get_paraml(req, "wdelayprob", sizeof(*wdelayprob)); if (wdelayprob == NULL) { gctl_error(req, "No '%s' argument", "wdelayprob"); return; } if (*wdelayprob < -1 || *wdelayprob > 100) { gctl_error(req, "Invalid '%s' argument", "wdelayprob"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } sc = pp->geom->softc; if (*error != -1) sc->sc_error = (int)*error; if (*rfailprob != -1) sc->sc_rfailprob = (u_int)*rfailprob; if (*wfailprob != -1) sc->sc_wfailprob = (u_int)*wfailprob; if (*rdelayprob != -1) sc->sc_rdelayprob = (u_int)*rdelayprob; if (*wdelayprob != -1) sc->sc_wdelayprob = (u_int)*wdelayprob; if (*delaymsec != -1) sc->sc_delaymsec = (u_int)*delaymsec; } } static struct g_geom * g_nop_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_nop_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { 
gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); gp = g_nop_find_geom(mp, name); if (gp == NULL) { G_NOP_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_nop_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_nop_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } sc = pp->geom->softc; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; } } static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_NOP_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_nop_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_nop_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_nop_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_nop_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_nop_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_offset); sbuf_printf(sb, "%s%u\n", indent, sc->sc_rfailprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_wfailprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_rdelayprob); sbuf_printf(sb, "%s%u\n", indent, sc->sc_wdelayprob); sbuf_printf(sb, "%s%d\n", indent, sc->sc_delaymsec); sbuf_printf(sb, "%s%d\n", indent, sc->sc_error); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_deletes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_getattrs); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_flushes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd0s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd1s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd2s); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_nop_class, g_nop); 
MODULE_VERSION(geom_nop, 0); Index: head/sys/geom/nop/g_nop.h =================================================================== --- head/sys/geom/nop/g_nop.h (revision 350693) +++ head/sys/geom/nop/g_nop.h (revision 350694) @@ -1,96 +1,81 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_NOP_H_ #define _G_NOP_H_ #define G_NOP_CLASS_NAME "NOP" #define G_NOP_VERSION 4 #define G_NOP_SUFFIX ".nop" /* * Special flag to instruct gnop to passthrough the underlying provider's * physical path */ #define G_NOP_PHYSPATH_PASSTHROUGH "\255" #ifdef _KERNEL -#define G_NOP_DEBUG(lvl, ...) do { \ - if (g_nop_debug >= (lvl)) { \ - printf("GEOM_NOP"); \ - if (g_nop_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) +#define G_NOP_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_NOP", g_nop_debug, (lvl), NULL, __VA_ARGS__) +#define G_NOP_LOGREQLVL(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_NOP", g_nop_debug, (lvl), (bp), __VA_ARGS__) #define G_NOP_LOGREQ(bp, ...) G_NOP_LOGREQLVL(2, bp, __VA_ARGS__) -#define G_NOP_LOGREQLVL(lvl, bp, ...) 
do { \ - if (g_nop_debug >= (lvl)) { \ - printf("GEOM_NOP[%d]: ", (lvl)); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) struct g_nop_delay; TAILQ_HEAD(g_nop_delay_head, g_nop_delay); struct g_nop_softc { int sc_error; off_t sc_offset; off_t sc_explicitsize; off_t sc_stripesize; off_t sc_stripeoffset; u_int sc_rfailprob; u_int sc_wfailprob; u_int sc_delaymsec; u_int sc_rdelayprob; u_int sc_wdelayprob; uintmax_t sc_reads; uintmax_t sc_writes; uintmax_t sc_deletes; uintmax_t sc_getattrs; uintmax_t sc_flushes; uintmax_t sc_cmd0s; uintmax_t sc_cmd1s; uintmax_t sc_cmd2s; uintmax_t sc_readbytes; uintmax_t sc_wrotebytes; char *sc_physpath; struct mtx sc_lock; struct g_nop_delay_head sc_head_delay; }; #endif /* _KERNEL */ #endif /* _G_NOP_H_ */ Index: head/sys/geom/raid/g_raid.c =================================================================== --- head/sys/geom/raid/g_raid.c (revision 350693) +++ head/sys/geom/raid/g_raid.c (revision 350694) @@ -1,2571 +1,2572 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include "g_raid_md_if.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); int g_raid_enable = 1; SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN, &g_raid_enable, 0, "Enable on-disk metadata taste"); u_int g_raid_aggressive_spare = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN, &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); u_int g_raid_debug = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0, "Debug level"); int g_raid_read_err_thresh = 10; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN, &g_raid_read_err_thresh, 0, "Number of read errors equated to disk failure"); u_int g_raid_start_timeout = 30; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN, &g_raid_start_timeout, 0, "Time to wait for all array components"); static u_int g_raid_clean_time = 5; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN, &g_raid_clean_time, 0, "Mark volume as clean when idling"); static u_int g_raid_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid_name_format = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN, &g_raid_name_format, 0, "Providers name format."); static u_int g_raid_idle_threshold = 1000000; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN, &g_raid_idle_threshold, 1000000, "Time in microseconds to consider a volume idle."); #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) LIST_HEAD(, g_raid_md_class) g_raid_md_classes = LIST_HEAD_INITIALIZER(g_raid_md_classes); LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = LIST_HEAD_INITIALIZER(g_raid_tr_classes); LIST_HEAD(, g_raid_volume) g_raid_volumes = LIST_HEAD_INITIALIZER(g_raid_volumes); static eventhandler_tag g_raid_post_sync = NULL; static int g_raid_started = 0; static int g_raid_shutdown = 0; static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid_taste; static void g_raid_init(struct g_class *mp); static void g_raid_fini(struct g_class *mp); struct g_class g_raid_class = { .name = G_RAID_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid_ctl, .taste = g_raid_taste, .destroy_geom = g_raid_destroy_geom, .init = g_raid_init, .fini = g_raid_fini }; static void g_raid_destroy_provider(struct g_raid_volume *vol); static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); static int g_raid_update_node(struct g_raid_softc *sc, u_int event); static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid_start(struct bio *bp); static void g_raid_start_request(struct bio *bp); 
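/*
 * The CTLFLAG_RWTUN knobs declared above surface as kern.geom.raid.*
 * sysctls (and as loader tunables of the same names).  A small userland
 * sketch that reads and bumps kern.geom.raid.debug with sysctlbyname(3);
 * it assumes the geom_raid module is loaded so the node exists, and that
 * the write is attempted with sufficient privilege.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_int debug;
	size_t len = sizeof(debug);

	if (sysctlbyname("kern.geom.raid.debug", &debug, &len, NULL, 0) != 0) {
		perror("kern.geom.raid.debug");	/* module not loaded? */
		return (1);
	}
	printf("current debug level: %u\n", debug);

	/* Raising the level needs root; failure is reported, not fatal. */
	debug = 1;
	if (sysctlbyname("kern.geom.raid.debug", NULL, NULL,
	    &debug, sizeof(debug)) != 0)
		perror("set kern.geom.raid.debug");
	return (0);
}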
static void g_raid_disk_done(struct bio *bp); static void g_raid_poll(struct g_raid_softc *sc); static const char * g_raid_node_event2str(int event) { switch (event) { case G_RAID_NODE_E_WAKE: return ("WAKE"); case G_RAID_NODE_E_START: return ("START"); default: return ("INVALID"); } } const char * g_raid_disk_state2str(int state) { switch (state) { case G_RAID_DISK_S_NONE: return ("NONE"); case G_RAID_DISK_S_OFFLINE: return ("OFFLINE"); case G_RAID_DISK_S_DISABLED: return ("DISABLED"); case G_RAID_DISK_S_FAILED: return ("FAILED"); case G_RAID_DISK_S_STALE_FAILED: return ("STALE_FAILED"); case G_RAID_DISK_S_SPARE: return ("SPARE"); case G_RAID_DISK_S_STALE: return ("STALE"); case G_RAID_DISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_disk_event2str(int event) { switch (event) { case G_RAID_DISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_subdisk_state2str(int state) { switch (state) { case G_RAID_SUBDISK_S_NONE: return ("NONE"); case G_RAID_SUBDISK_S_FAILED: return ("FAILED"); case G_RAID_SUBDISK_S_NEW: return ("NEW"); case G_RAID_SUBDISK_S_REBUILD: return ("REBUILD"); case G_RAID_SUBDISK_S_UNINITIALIZED: return ("UNINITIALIZED"); case G_RAID_SUBDISK_S_STALE: return ("STALE"); case G_RAID_SUBDISK_S_RESYNC: return ("RESYNC"); case G_RAID_SUBDISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_subdisk_event2str(int event) { switch (event) { case G_RAID_SUBDISK_E_NEW: return ("NEW"); case G_RAID_SUBDISK_E_FAILED: return ("FAILED"); case G_RAID_SUBDISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_volume_state2str(int state) { switch (state) { case G_RAID_VOLUME_S_STARTING: return ("STARTING"); case G_RAID_VOLUME_S_BROKEN: return ("BROKEN"); case G_RAID_VOLUME_S_DEGRADED: return ("DEGRADED"); case G_RAID_VOLUME_S_SUBOPTIMAL: return ("SUBOPTIMAL"); case G_RAID_VOLUME_S_OPTIMAL: return ("OPTIMAL"); case G_RAID_VOLUME_S_UNSUPPORTED: return ("UNSUPPORTED"); case G_RAID_VOLUME_S_STOPPED: return ("STOPPED"); default: return ("INVALID"); } } static const char * g_raid_volume_event2str(int event) { switch (event) { case G_RAID_VOLUME_E_UP: return ("UP"); case G_RAID_VOLUME_E_DOWN: return ("DOWN"); case G_RAID_VOLUME_E_START: return ("START"); case G_RAID_VOLUME_E_STARTMD: return ("STARTMD"); default: return ("INVALID"); } } const char * g_raid_volume_level2str(int level, int qual) { switch (level) { case G_RAID_VOLUME_RL_RAID0: return ("RAID0"); case G_RAID_VOLUME_RL_RAID1: return ("RAID1"); case G_RAID_VOLUME_RL_RAID3: if (qual == G_RAID_VOLUME_RLQ_R3P0) return ("RAID3-P0"); if (qual == G_RAID_VOLUME_RLQ_R3PN) return ("RAID3-PN"); return ("RAID3"); case G_RAID_VOLUME_RL_RAID4: if (qual == G_RAID_VOLUME_RLQ_R4P0) return ("RAID4-P0"); if (qual == G_RAID_VOLUME_RLQ_R4PN) return ("RAID4-PN"); return ("RAID4"); case G_RAID_VOLUME_RL_RAID5: if (qual == G_RAID_VOLUME_RLQ_R5RA) return ("RAID5-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RS) return ("RAID5-RS"); if (qual == G_RAID_VOLUME_RLQ_R5LA) return ("RAID5-LA"); if (qual == G_RAID_VOLUME_RLQ_R5LS) return ("RAID5-LS"); return ("RAID5"); case G_RAID_VOLUME_RL_RAID6: if (qual == G_RAID_VOLUME_RLQ_R6RA) return ("RAID6-RA"); if (qual == G_RAID_VOLUME_RLQ_R6RS) return ("RAID6-RS"); if (qual == G_RAID_VOLUME_RLQ_R6LA) return ("RAID6-LA"); if (qual == G_RAID_VOLUME_RLQ_R6LS) return ("RAID6-LS"); return ("RAID6"); case G_RAID_VOLUME_RL_RAIDMDF: if (qual == G_RAID_VOLUME_RLQ_RMDFRA) 
return ("RAIDMDF-RA"); if (qual == G_RAID_VOLUME_RLQ_RMDFRS) return ("RAIDMDF-RS"); if (qual == G_RAID_VOLUME_RLQ_RMDFLA) return ("RAIDMDF-LA"); if (qual == G_RAID_VOLUME_RLQ_RMDFLS) return ("RAIDMDF-LS"); return ("RAIDMDF"); case G_RAID_VOLUME_RL_RAID1E: if (qual == G_RAID_VOLUME_RLQ_R1EA) return ("RAID1E-A"); if (qual == G_RAID_VOLUME_RLQ_R1EO) return ("RAID1E-O"); return ("RAID1E"); case G_RAID_VOLUME_RL_SINGLE: return ("SINGLE"); case G_RAID_VOLUME_RL_CONCAT: return ("CONCAT"); case G_RAID_VOLUME_RL_RAID5E: if (qual == G_RAID_VOLUME_RLQ_R5ERA) return ("RAID5E-RA"); if (qual == G_RAID_VOLUME_RLQ_R5ERS) return ("RAID5E-RS"); if (qual == G_RAID_VOLUME_RLQ_R5ELA) return ("RAID5E-LA"); if (qual == G_RAID_VOLUME_RLQ_R5ELS) return ("RAID5E-LS"); return ("RAID5E"); case G_RAID_VOLUME_RL_RAID5EE: if (qual == G_RAID_VOLUME_RLQ_R5EERA) return ("RAID5EE-RA"); if (qual == G_RAID_VOLUME_RLQ_R5EERS) return ("RAID5EE-RS"); if (qual == G_RAID_VOLUME_RLQ_R5EELA) return ("RAID5EE-LA"); if (qual == G_RAID_VOLUME_RLQ_R5EELS) return ("RAID5EE-LS"); return ("RAID5EE"); case G_RAID_VOLUME_RL_RAID5R: if (qual == G_RAID_VOLUME_RLQ_R5RRA) return ("RAID5R-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RRS) return ("RAID5R-RS"); if (qual == G_RAID_VOLUME_RLQ_R5RLA) return ("RAID5R-LA"); if (qual == G_RAID_VOLUME_RLQ_R5RLS) return ("RAID5R-LS"); return ("RAID5E"); default: return ("UNKNOWN"); } } int g_raid_volume_str2level(const char *str, int *level, int *qual) { *level = G_RAID_VOLUME_RL_UNKNOWN; *qual = G_RAID_VOLUME_RLQ_NONE; if (strcasecmp(str, "RAID0") == 0) *level = G_RAID_VOLUME_RL_RAID0; else if (strcasecmp(str, "RAID1") == 0) *level = G_RAID_VOLUME_RL_RAID1; else if (strcasecmp(str, "RAID3-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3P0; } else if (strcasecmp(str, "RAID3-PN") == 0 || strcasecmp(str, "RAID3") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3PN; } else if (strcasecmp(str, "RAID4-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4P0; } else if (strcasecmp(str, "RAID4-PN") == 0 || strcasecmp(str, "RAID4") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4PN; } else if (strcasecmp(str, "RAID5-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RA; } else if (strcasecmp(str, "RAID5-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RS; } else if (strcasecmp(str, "RAID5") == 0 || strcasecmp(str, "RAID5-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LA; } else if (strcasecmp(str, "RAID5-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LS; } else if (strcasecmp(str, "RAID6-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RA; } else if (strcasecmp(str, "RAID6-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RS; } else if (strcasecmp(str, "RAID6") == 0 || strcasecmp(str, "RAID6-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LA; } else if (strcasecmp(str, "RAID6-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LS; } else if (strcasecmp(str, "RAIDMDF-RA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRA; } else if (strcasecmp(str, "RAIDMDF-RS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRS; } else if (strcasecmp(str, "RAIDMDF") == 0 || strcasecmp(str, "RAIDMDF-LA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLA; } else if (strcasecmp(str, 
"RAIDMDF-LS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLS; } else if (strcasecmp(str, "RAID10") == 0 || strcasecmp(str, "RAID1E") == 0 || strcasecmp(str, "RAID1E-A") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EA; } else if (strcasecmp(str, "RAID1E-O") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EO; } else if (strcasecmp(str, "SINGLE") == 0) *level = G_RAID_VOLUME_RL_SINGLE; else if (strcasecmp(str, "CONCAT") == 0) *level = G_RAID_VOLUME_RL_CONCAT; else if (strcasecmp(str, "RAID5E-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERA; } else if (strcasecmp(str, "RAID5E-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERS; } else if (strcasecmp(str, "RAID5E") == 0 || strcasecmp(str, "RAID5E-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELA; } else if (strcasecmp(str, "RAID5E-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELS; } else if (strcasecmp(str, "RAID5EE-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERA; } else if (strcasecmp(str, "RAID5EE-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERS; } else if (strcasecmp(str, "RAID5EE") == 0 || strcasecmp(str, "RAID5EE-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELA; } else if (strcasecmp(str, "RAID5EE-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELS; } else if (strcasecmp(str, "RAID5R-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRA; } else if (strcasecmp(str, "RAID5R-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRS; } else if (strcasecmp(str, "RAID5R") == 0 || strcasecmp(str, "RAID5R-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLA; } else if (strcasecmp(str, "RAID5R-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLS; } else return (-1); return (0); } const char * g_raid_get_diskname(struct g_raid_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_consumer->provider->name); } void g_raid_get_disk_info(struct g_raid_disk *disk) { struct g_consumer *cp = disk->d_consumer; int error, len; /* Read kernel dumping information. */ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); if (error) disk->d_kd.di.dumper = NULL; if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG1(2, disk->d_softc, "Dumping not supported by %s: %d.", cp->provider->name, error); /* Read BIO_DELETE support. 
*/ error = g_getattr("GEOM::candelete", cp, &disk->d_candelete); if (error) disk->d_candelete = 0; if (!disk->d_candelete) G_RAID_DEBUG1(2, disk->d_softc, "BIO_DELETE not supported by %s: %d.", cp->provider->name, error); } void g_raid_report_disk_state(struct g_raid_disk *disk) { struct g_raid_subdisk *sd; int len, state; uint32_t s; if (disk->d_consumer == NULL) return; if (disk->d_state == G_RAID_DISK_S_DISABLED) { s = G_STATE_ACTIVE; /* XXX */ } else if (disk->d_state == G_RAID_DISK_S_FAILED || disk->d_state == G_RAID_DISK_S_STALE_FAILED) { s = G_STATE_FAILED; } else { state = G_RAID_SUBDISK_S_ACTIVE; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { if (sd->sd_state < state) state = sd->sd_state; } if (state == G_RAID_SUBDISK_S_FAILED) s = G_STATE_FAILED; else if (state == G_RAID_SUBDISK_S_NEW || state == G_RAID_SUBDISK_S_REBUILD) s = G_STATE_REBUILD; else if (state == G_RAID_SUBDISK_S_STALE || state == G_RAID_SUBDISK_S_RESYNC) s = G_STATE_RESYNC; else s = G_STATE_ACTIVE; } len = sizeof(s); g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", g_raid_get_diskname(disk), s); } void g_raid_change_disk_state(struct g_raid_disk *disk, int state) { G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", g_raid_get_diskname(disk), g_raid_disk_state2str(disk->d_state), g_raid_disk_state2str(state)); disk->d_state = state; g_raid_report_disk_state(disk); } void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) { G_RAID_DEBUG1(0, sd->sd_softc, "Subdisk %s:%d-%s state changed from %s to %s.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", g_raid_subdisk_state2str(sd->sd_state), g_raid_subdisk_state2str(state)); sd->sd_state = state; if (sd->sd_disk) g_raid_report_disk_state(sd->sd_disk); } void g_raid_change_volume_state(struct g_raid_volume *vol, int state) { G_RAID_DEBUG1(0, vol->v_softc, "Volume %s state changed from %s to %s.", vol->v_name, g_raid_volume_state2str(vol->v_state), g_raid_volume_state2str(state)); vol->v_state = state; } /* * --- Events handling functions --- * Events in geom_raid are used to maintain subdisks and volumes status * from one thread to simplify locking. */ static void g_raid_event_free(struct g_raid_event *ep) { free(ep, M_RAID); } int g_raid_event_send(void *arg, int event, int flags) { struct g_raid_softc *sc; struct g_raid_event *ep; int error; if ((flags & G_RAID_EVENT_VOLUME) != 0) { sc = ((struct g_raid_volume *)arg)->v_softc; } else if ((flags & G_RAID_EVENT_DISK) != 0) { sc = ((struct g_raid_disk *)arg)->d_softc; } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { sc = ((struct g_raid_subdisk *)arg)->sd_softc; } else { sc = arg; } ep = malloc(sizeof(*ep), M_RAID, sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); if (ep == NULL) return (ENOMEM); ep->e_tgt = arg; ep->e_event = event; ep->e_flags = flags; ep->e_error = 0; G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); if ((flags & G_RAID_EVENT_WAIT) == 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { mtx_lock(&sc->sc_queue_mtx); MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_raid_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static void g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep, *tmpep; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if (ep->e_tgt != tgt) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) g_raid_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_queue_mtx); } static int g_raid_event_check(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep; int res = 0; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(ep, &sc->sc_events, e_next) { if (ep->e_tgt != tgt) continue; res = 1; break; } mtx_unlock(&sc->sc_queue_mtx); return (res); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_ndisks(struct g_raid_softc *sc, int state) { struct g_raid_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == state || state == -1) n++; } return (n); } /* * Return the number of subdisks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *subdisk; struct g_raid_softc *sc; u_int i, n ; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; for (i = 0; i < vol->v_disks_count; i++) { subdisk = &vol->v_subdisks[i]; if ((state == -1 && subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || subdisk->sd_state == state) n++; } return (n); } /* * Return the first subdisk in given state. * If state is equal to -1, then the first connected disks. 
*/ struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *sd; struct g_raid_softc *sc; u_int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((state == -1 && sd->sd_state != G_RAID_SUBDISK_S_NONE) || sd->sd_state == state) return (sd); } return (NULL); } struct g_consumer * g_raid_open_consumer(struct g_raid_softc *sc, const char *name) { struct g_consumer *cp; struct g_provider *pp; g_topology_assert(); if (strncmp(name, "/dev/", 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) return (NULL); cp = g_new_consumer(sc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 1, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } return (cp); } static u_int g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } u_int g_raid_nopens(struct g_raid_softc *sc) { struct g_raid_volume *vol; u_int opens; opens = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_provider_open != 0) opens++; } return (opens); } static int g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid_nrequests(sc, cp) > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert_not(); g_topology_lock(); cp->private = NULL; if (g_raid_consumer_is_busy(sc, cp)) goto out; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. 
*/ g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); goto out; } G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); out: g_topology_unlock(); } static void g_raid_orphan(struct g_consumer *cp) { struct g_raid_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK); } static void g_raid_clean(struct g_raid_volume *vol, int acw) { struct g_raid_softc *sc; int timeout; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; if (!vol->v_dirty) return; if (vol->v_writes > 0) return; if (acw > 0 || (acw == -1 && vol->v_provider != NULL && vol->v_provider->acw > 0)) { timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); if (!g_raid_shutdown && timeout > 0) return; } vol->v_dirty = 0; G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } static void g_raid_dirty(struct g_raid_volume *vol) { struct g_raid_softc *sc; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; vol->v_dirty = 1; G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. */ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) continue; cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_kerneldump_common_done(struct bio *bp) { bp->bio_flags |= BIO_DONE; } int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct bio bp; vol = tr->tro_volume; sc = vol->v_softc; g_reset_bio(&bp); bp.bio_cmd = BIO_WRITE; bp.bio_done = g_raid_tr_kerneldump_common_done; bp.bio_attribute = NULL; bp.bio_offset = offset; bp.bio_length = length; bp.bio_data = virtual; bp.bio_to = vol->v_provider; g_raid_start(&bp); while (!(bp.bio_flags & BIO_DONE)) { G_RAID_DEBUG1(4, sc, "Poll..."); g_raid_poll(sc); DELAY(10); } return (bp.bio_error != 0 ? 
EIO : 0); } static int g_raid_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; int error; vol = (struct g_raid_volume *)arg; G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", (long long unsigned)offset, (long long unsigned)length); error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, physical, offset, length); return (error); } static void g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) { struct g_kerneldump *gkd; struct g_provider *pp; struct g_raid_volume *vol; gkd = (struct g_kerneldump*)bp->bio_data; pp = bp->bio_to; vol = pp->private; g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); gkd->di.dumper = g_raid_dump; gkd->di.priv = vol; gkd->di.blocksize = vol->v_sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > vol->v_mediasize) gkd->length = vol->v_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_raid_candelete(struct g_raid_softc *sc, struct bio *bp) { struct g_provider *pp; struct g_raid_volume *vol; struct g_raid_subdisk *sd; int i, val; pp = bp->bio_to; vol = pp->private; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if (sd->sd_disk->d_candelete) break; } val = i < vol->v_disks_count; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_raid_start(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid_start() should not be called at all. */ // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, // ("Provider's error should be set (error=%d)(mirror=%s).", // bp->bio_to->error, bp->bio_to->name)); G_RAID_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) g_raid_candelete(sc, bp); else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_raid_kerneldump(sc, bp); else g_io_deliver(bp, EOPNOTSUPP); return; default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) { G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); wakeup(sc); } } static int g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) { /* * 5 cases: * (1) bp entirely below NO * (2) bp entirely above NO * (3) bp start below, but end in range YES * (4) bp entirely within YES * (5) bp starts within, ends above YES * * lock range 10-19 (offset 10 length 10) * (1) 1-5: first if kicks it out * (2) 30-35: second if kicks it out * (3) 5-15: passes both ifs * (4) 12-14: passes both ifs * (5) 19-20: passes both */ off_t lend = lstart + len - 1; off_t bstart = bp->bio_offset; off_t bend = bp->bio_offset + bp->bio_length - 1; if (bend < lstart) return (0); if (lend < bstart) return (0); return (1); } static int g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) { struct g_raid_lock *lp; sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); LIST_FOREACH(lp, &vol->v_locks, l_next) { if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) return (1); } return (0); } static void g_raid_start_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; 
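
/*
 * [Editor's aside -- illustrative sketch only, not part of this commit.]
 * g_raid_bio_overlaps() above treats both the lock range and the bio as
 * closed intervals: two intervals can only miss each other when one ends
 * before the other begins.  The stand-alone, user-space sketch below
 * replays the five cases listed in that comment (lock range 10-19); the
 * helper name "overlaps" is made up for the example.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
#include <stdio.h>

static int
overlaps(long long bstart, long long bend, long long lstart, long long lend)
{

	if (bend < lstart)		/* bio entirely below the locked range */
		return (0);
	if (lend < bstart)		/* bio entirely above the locked range */
		return (0);
	return (1);			/* everything else overlaps */
}

int
main(void)
{

	printf("%d %d %d %d %d\n",
	    overlaps(1, 5, 10, 19),	/* (1) entirely below   -> 0 */
	    overlaps(30, 35, 10, 19),	/* (2) entirely above   -> 0 */
	    overlaps(5, 15, 10, 19),	/* (3) straddles start  -> 1 */
	    overlaps(12, 14, 10, 19),	/* (4) fully inside     -> 1 */
	    overlaps(19, 20, 10, 19));	/* (5) straddles end    -> 1 */
	return (0);
}
#endif
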
sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; /* * Check to see if this item is in a locked range. If so, * queue it to our locked queue and return. We'll requeue * it when the range is unlocked. Internal I/O for the * rebuild/rescan/recovery process is excluded from this * check so we can actually do the recovery. */ if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && g_raid_is_in_locked_range(vol, bp)) { G_RAID_LOGREQ(3, bp, "Defer request."); bioq_insert_tail(&vol->v_locked, bp); return; } /* * If we're actually going to do the write/delete, then * update the idle stats for the volume. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { if (!vol->v_dirty) g_raid_dirty(vol); vol->v_writes++; } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. Then tell * the transformation layer to start the I/O. */ bioq_insert_tail(&vol->v_inflight, bp); G_RAID_LOGREQ(4, bp, "Request started"); G_RAID_TR_IOSTART(vol->v_tr, bp); } static void g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) { off_t off, len; struct bio *nbp; struct g_raid_lock *lp; vol->v_pending_lock = 0; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_pending) { off = lp->l_offset; len = lp->l_length; lp->l_pending = 0; TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { if (g_raid_bio_overlaps(nbp, off, len)) lp->l_pending++; } if (lp->l_pending) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock(%jd, %jd) has %d pending", (intmax_t)off, (intmax_t)(off + len), lp->l_pending); continue; } G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock of %jd to %jd completed", (intmax_t)off, (intmax_t)(off + len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); } } } void g_raid_iodone(struct bio *bp, int error) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; G_RAID_LOGREQ(3, bp, "Request done: %d.", error); /* Update stats if we done write/delete. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { vol->v_writes--; vol->v_last_write = time_uptime; } bioq_remove(&vol->v_inflight, bp); if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) g_raid_finish_with_locked_ranges(vol, bp); getmicrouptime(&vol->v_last_done); g_io_deliver(bp, error); } int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp) { struct g_raid_softc *sc; struct g_raid_lock *lp; struct bio *bp; sc = vol->v_softc; lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); lp->l_offset = off; lp->l_length = len; lp->l_callback_arg = argp; lp->l_pending = 0; TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) lp->l_pending++; } /* * If there are any writes that are pending, we return EBUSY. All * callers will have to wait until all pending writes clear. 
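 */

/*
 * [Editor's aside -- a hedged, hypothetical sketch, not part of this
 * commit.]  A transformation module that needs exclusive access to a
 * region (for example while rebuilding it) uses the pair
 * g_raid_lock_range()/g_raid_unlock_range().  On EBUSY the lock has been
 * recorded but overlapping writes are still in flight, and the module is
 * notified later through its G_RAID_TR_LOCKED() callback once
 * g_raid_finish_with_locked_ranges() sees the range drain.  The function
 * name below is invented for the example.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
static void
demo_lock_rebuild_chunk(struct g_raid_volume *vol, off_t off, off_t len)
{

	/*
	 * The actual copy is driven from the G_RAID_TR_LOCKED()
	 * callback: it runs synchronously when this returns 0, or
	 * later, once the overlapping in-flight writes drain, when
	 * this returns EBUSY.  Either way the range stays locked until
	 * g_raid_unlock_range(vol, off, len) is called.
	 */
	(void)g_raid_lock_range(vol, off, len, NULL, vol);
}
#endif

/*
 * (End of editor's aside; the original EBUSY path continues.)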
*/ if (lp->l_pending > 0) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", (intmax_t)off, (intmax_t)(off+len), lp->l_pending); return (EBUSY); } G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", (intmax_t)off, (intmax_t)(off+len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); return (0); } int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) { struct g_raid_lock *lp; struct g_raid_softc *sc; struct bio *bp; sc = vol->v_softc; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_offset == off && lp->l_length == len) { LIST_REMOVE(lp, l_next); /* XXX * Right now we just put them all back on the queue * and hope for the best. We hope this because any * locked ranges will go right back on this list * when the worker thread runs. * XXX */ G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", (intmax_t)lp->l_offset, (intmax_t)(lp->l_offset+lp->l_length)); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); free(lp, M_RAID); return (0); } } return (EINVAL); } void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) { struct g_consumer *cp; struct g_raid_disk *disk, *tdisk; bp->bio_caller1 = sd; /* * Make sure that the disk is present. Generally it is a task of * transformation layers to not send requests to absent disks, but * it is better to be safe and report situation then sorry. */ if (sd->sd_disk == NULL) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); nodisk: bp->bio_from = NULL; bp->bio_to = NULL; bp->bio_error = ENXIO; g_raid_disk_done(bp); return; } disk = sd->sd_disk; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_FAILED) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); goto nodisk; } cp = disk->d_consumer; bp->bio_from = cp; bp->bio_to = cp->provider; cp->index++; /* Update average disks load. 
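 */

/*
 * [Editor's aside -- illustrative sketch only, not part of this commit.]
 * The per-disk load figure updated in the loop below is an exponentially
 * weighted moving average: each update keeps 7/8 of the previous value
 * and folds in the consumer's current queue depth scaled by
 * G_RAID_SUBDISK_LOAD_SCALE (256).  The user-space sketch shows the same
 * recurrence converging for a steady queue depth of 4.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
#include <stdio.h>

#define LOAD_SCALE	256	/* mirrors G_RAID_SUBDISK_LOAD_SCALE */

int
main(void)
{
	unsigned int load = 0;
	int i;

	for (i = 0; i < 100; i++)
		load = (4 * LOAD_SCALE + load * 7) / 8;
	/* Settles just below 4 * 256 = 1024 (integer truncation). */
	printf("%u\n", load);
	return (0);
}
#endif

/*
 * (End of editor's aside; the load-average update follows.)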
*/ TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { if (tdisk->d_consumer == NULL) tdisk->d_load = 0; else tdisk->d_load = (tdisk->d_consumer->index * G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; } disk->d_last_offset = bp->bio_offset + bp->bio_length; if (dumping) { G_RAID_LOGREQ(3, bp, "Sending dumping request."); if (bp->bio_cmd == BIO_WRITE) { bp->bio_error = g_raid_subdisk_kerneldump(sd, bp->bio_data, 0, bp->bio_offset, bp->bio_length); } else bp->bio_error = EOPNOTSUPP; g_raid_disk_done(bp); } else { bp->bio_done = g_raid_disk_done; bp->bio_offset += sd->sd_offset; G_RAID_LOGREQ(3, bp, "Sending request."); g_io_request(bp, cp); } } int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (sd->sd_disk == NULL) return (ENXIO); if (sd->sd_disk->d_kd.di.dumper == NULL) return (EOPNOTSUPP); return (dump_write(&sd->sd_disk->d_kd.di, virtual, physical, sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length)); } static void g_raid_disk_done(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; sd = bp->bio_caller1; sc = sd->sd_softc; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) wakeup(sc); } static void g_raid_disk_done_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_volume *vol; g_topology_assert_not(); G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); sd = bp->bio_caller1; sc = sd->sd_softc; vol = sd->sd_volume; if (bp->bio_from != NULL) { bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) g_raid_kill_consumer(sc, bp->bio_from); } bp->bio_offset -= sd->sd_offset; G_RAID_TR_IODONE(vol->v_tr, sd, bp); } static void g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); else ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid_event_free(ep); } else { ep->e_flags |= G_RAID_EVENT_DONE; G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); mtx_lock(&sc->sc_queue_mtx); wakeup(ep); mtx_unlock(&sc->sc_queue_mtx); } } /* * Worker thread. */ static void g_raid_worker(void *arg) { struct g_raid_softc *sc; struct g_raid_event *ep; struct g_raid_volume *vol; struct bio *bp; struct timeval now, t; int timeout, rv; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. 
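 */

/*
 * [Editor's aside -- a hedged, hypothetical example, not part of this
 * commit.]  g_raid_handle_event() above is the consumer side of the
 * queue filled by g_raid_event_send().  A sender that needs the result
 * passes G_RAID_EVENT_WAIT: g_raid_event_send() then drops sc_lock,
 * sleeps until the worker sets G_RAID_EVENT_DONE, and returns the
 * handler's e_error.  The wrapper name below is invented.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
static int
demo_disconnect_disk_sync(struct g_raid_disk *disk)
{

	/* Blocks until the worker thread has processed the event. */
	return (g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
	    G_RAID_EVENT_DISK | G_RAID_EVENT_WAIT));
}
#endif

/*
 * (End of editor's aside.)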
*/ bp = NULL; vol = NULL; rv = 0; ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) TAILQ_REMOVE(&sc->sc_events, ep, e_next); else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) ; else { getmicrouptime(&now); t = now; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr && timevalcmp(&vol->v_last_done, &t, < )) t = vol->v_last_done; } timevalsub(&t, &now); timeout = g_raid_idle_threshold + t.tv_sec * 1000000 + t.tv_usec; if (timeout > 0) { /* * Two steps to avoid overflows at HZ=1000 * and idle timeouts > 2.1s. Some rounding * errors can occur, but they are < 1tick, * which is deemed to be close enough for * this purpose. */ int micpertic = 1000000 / hz; timeout = (timeout + micpertic - 1) / micpertic; sx_xunlock(&sc->sc_lock); MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-", timeout); sx_xlock(&sc->sc_lock); goto process; } else rv = EWOULDBLOCK; } mtx_unlock(&sc->sc_queue_mtx); process: if (ep != NULL) { g_raid_handle_event(sc, ep); } else if (bp != NULL) { if (bp->bio_to != NULL && bp->bio_to->geom == sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } else if (rv == EWOULDBLOCK) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_clean(vol, -1); if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr) { t.tv_sec = g_raid_idle_threshold / 1000000; t.tv_usec = g_raid_idle_threshold % 1000000; timevaladd(&t, &vol->v_last_done); getmicrouptime(&now); if (timevalcmp(&t, &now, <= )) { G_RAID_TR_IDLE(vol->v_tr); vol->v_last_done = now; } } } } if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_destroy_node(sc, 1); /* May not return. */ } } static void g_raid_poll(struct g_raid_softc *sc) { struct g_raid_event *ep; struct bio *bp; sx_xlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) { TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); g_raid_handle_event(sc, ep); goto out; } bp = bioq_takefirst(&sc->sc_queue); if (bp != NULL) { mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } out: sx_xunlock(&sc->sc_lock); } static void g_raid_launch_provider(struct g_raid_volume *vol) { struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_softc *sc; struct g_provider *pp; char name[G_RAID_MAX_VOLUMENAME]; off_t off; int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); /* Try to name provider with volume name. */ snprintf(name, sizeof(name), "raid/%s", vol->v_name); if (g_raid_name_format == 0 || vol->v_name[0] == 0 || g_provider_by_name(name) != NULL) { /* Otherwise use sequential volume number. 
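 */

/*
 * [Editor's aside -- illustrative sketch only, not part of this commit;
 * it refers back to the idle-timeout conversion in g_raid_worker()
 * above.]  A one-step "usec * hz / 1000000" can overflow a 32-bit int
 * once the product passes INT_MAX (about 2.1 s of timeout at hz = 1000),
 * so the worker divides by the microseconds-per-tick value first and
 * rounds up by hand, accepting an error of less than one tick.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
#include <stdio.h>

int
main(void)
{
	int hz = 1000;
	int timeout_us = 5000000;		/* 5 s idle threshold */
	int micpertic = 1000000 / hz;		/* 1000 us per tick */
	int ticks = (timeout_us + micpertic - 1) / micpertic;

	/* 5000 ticks, computed without any intermediate overflow. */
	printf("%d\n", ticks);
	return (0);
}
#endif

/*
 * (End of editor's aside; otherwise fall back to a sequential volume
 * number.)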
*/ snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); } pp = g_new_providerf(sc->sc_geom, "%s", name); pp->flags |= G_PF_DIRECT_RECEIVE; if (vol->v_tr->tro_class->trc_accept_unmapped) { pp->flags |= G_PF_ACCEPT_UNMAPPED; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if ((sd->sd_disk->d_consumer->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->private = vol; pp->mediasize = vol->v_mediasize; pp->sectorsize = vol->v_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { if ((disk = vol->v_subdisks[0].sd_disk) != NULL && disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { pp->stripesize = disk->d_consumer->provider->stripesize; off = disk->d_consumer->provider->stripeoffset; pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; if (off > 0) pp->stripeoffset %= off; } if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { pp->stripesize *= (vol->v_disks_count - 1); pp->stripeoffset *= (vol->v_disks_count - 1); } } else pp->stripesize = vol->v_strip_size; vol->v_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", pp->name, vol->v_name); } static void g_raid_destroy_provider(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_provider *pp; struct bio *bp, *tmp; g_topology_assert_not(); sc = vol->v_softc; pp = vol->v_provider; KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); g_topology_lock(); g_error_provider(pp, ENXIO); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { if (bp->bio_to != pp) continue; bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", pp->name, vol->v_name); g_wither_provider(pp, ENXIO); g_topology_unlock(); vol->v_provider = NULL; } /* * Update device state. */ static int g_raid_update_volume(struct g_raid_volume *vol, u_int event) { struct g_raid_softc *sc; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", g_raid_volume_event2str(event), vol->v_name); switch (event) { case G_RAID_VOLUME_E_DOWN: if (vol->v_provider != NULL) g_raid_destroy_provider(vol); break; case G_RAID_VOLUME_E_UP: if (vol->v_provider == NULL) g_raid_launch_provider(vol); break; case G_RAID_VOLUME_E_START: if (vol->v_tr) G_RAID_TR_START(vol->v_tr); return (0); default: if (sc->sc_md) G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); return (0); } /* Manage root mount release. */ if (vol->v_starting) { vol->v_starting = 0; G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); root_mount_rel(vol->v_rootmount); vol->v_rootmount = NULL; } if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); return (0); } /* * Update subdisk state. */ static int g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = sd->sd_softc; vol = sd->sd_volume; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", g_raid_subdisk_event2str(event), vol->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); if (vol->v_tr) G_RAID_TR_EVENT(vol->v_tr, sd, event); return (0); } /* * Update disk state. */ static int g_raid_update_disk(struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", g_raid_disk_event2str(event), g_raid_get_diskname(disk)); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, disk, event); return (0); } /* * Node event. */ static int g_raid_update_node(struct g_raid_softc *sc, u_int event) { sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for the array.", g_raid_node_event2str(event)); if (event == G_RAID_NODE_E_WAKE) return (0); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, NULL, event); return (0); } static int g_raid_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid_volume *vol; struct g_raid_softc *sc; int dcw, opens, error = 0; g_topology_assert(); sc = pp->geom->softc; vol = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcw = pp->acw + acw; g_topology_unlock(); sx_xlock(&sc->sc_lock); /* Deny new opens while dying. */ if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { error = ENXIO; goto out; } /* Deny write opens for read-only volumes. */ if (vol->v_read_only && acw > 0) { error = EROFS; goto out; } if (dcw == 0) g_raid_clean(vol, dcw); vol->v_provider_open += acr + acw + ace; /* Handle delayed node destruction. */ if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && vol->v_provider_open == 0) { /* Count open volumes. */ opens = g_raid_nopens(sc); if (opens == 0) { sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to make it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } } /* Handle open volume destruction. 
*/ if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); out: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md) { struct g_raid_softc *sc; struct g_geom *gp; int error; g_topology_assert(); G_RAID_DEBUG(1, "Creating array %s.", name); gp = g_new_geomf(mp, "%s", name); sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); gp->start = g_raid_start; gp->orphan = g_raid_orphan; gp->access = g_raid_access; gp->dumpconf = g_raid_dumpconf; sc->sc_md = md; sc->sc_geom = gp; sc->sc_flags = 0; TAILQ_INIT(&sc->sc_volumes); TAILQ_INIT(&sc->sc_disks); sx_init(&sc->sc_lock, "graid:lock"); mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); bioq_init(&sc->sc_queue); gp->softc = sc; error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, "g_raid %s", name); if (error != 0) { G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_RAID); return (NULL); } G_RAID_DEBUG1(0, sc, "Array %s created.", name); return (sc); } struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) { struct g_raid_volume *vol, *vol1; int i; G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); vol->v_softc = sc; strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); vol->v_state = G_RAID_VOLUME_S_STARTING; vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; vol->v_rotate_parity = 1; bioq_init(&vol->v_inflight); bioq_init(&vol->v_locked); LIST_INIT(&vol->v_locks); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { vol->v_subdisks[i].sd_softc = sc; vol->v_subdisks[i].sd_volume = vol; vol->v_subdisks[i].sd_pos = i; vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; } /* Find free ID for this volume. */ g_topology_lock(); vol1 = vol; if (id >= 0) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } } if (vol1 != NULL) { for (id = 0; ; id++) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } if (vol1 == NULL) break; } } vol->v_global_id = id; LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); g_topology_unlock(); /* Delay root mounting. 
*/ vol->v_rootmount = root_mount_hold("GRAID"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); vol->v_starting = 1; TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); return (vol); } struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc) { struct g_raid_disk *disk; G_RAID_DEBUG1(1, sc, "Creating disk."); disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); disk->d_softc = sc; disk->d_state = G_RAID_DISK_S_NONE; TAILQ_INIT(&disk->d_subdisks); TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); return (disk); } int g_raid_start_volume(struct g_raid_volume *vol) { struct g_raid_tr_class *class; struct g_raid_tr_object *obj; int status; G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { if (!class->trc_enable) continue; G_RAID_DEBUG1(2, vol->v_softc, "Tasting volume %s for %s transformation.", vol->v_name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->tro_class = class; obj->tro_volume = vol; status = G_RAID_TR_TASTE(obj, vol); if (status != G_RAID_TR_TASTE_FAIL) break; kobj_delete((kobj_t)obj, M_RAID); } if (class == NULL) { G_RAID_DEBUG1(0, vol->v_softc, "No transformation module found for %s.", vol->v_name); vol->v_tr = NULL; g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); return (-1); } G_RAID_DEBUG1(2, vol->v_softc, "Transformation module %s chosen for %s.", class->name, vol->v_name); vol->v_tr = obj; return (0); } int g_raid_destroy_node(struct g_raid_softc *sc, int worker) { struct g_raid_volume *vol, *tmpv; struct g_raid_disk *disk, *tmpd; int error = 0; sc->sc_stopping = G_RAID_DESTROY_HARD; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { if (g_raid_destroy_volume(vol)) error = EBUSY; } if (error) return (error); TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { if (g_raid_destroy_disk(disk)) error = EBUSY; } if (error) return (error); if (sc->sc_md) { G_RAID_MD_FREE(sc->sc_md); kobj_delete((kobj_t)sc->sc_md, M_RAID); sc->sc_md = NULL; } if (sc->sc_geom != NULL) { G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); g_topology_lock(); sc->sc_geom->softc = NULL; g_wither_geom(sc->sc_geom, ENXIO); g_topology_unlock(); sc->sc_geom = NULL; } else G_RAID_DEBUG(1, "Array destroyed."); if (worker) { g_raid_event_cancel(sc, sc); mtx_destroy(&sc->sc_queue_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); wakeup(&sc->sc_stopping); free(sc, M_RAID); curthread->td_pflags &= ~TDP_GEOM; G_RAID_DEBUG(1, "Thread exiting."); kproc_exit(0); } else { /* Wake up worker to make it selfdestruct. 
*/ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_volume(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_disk *disk; int i; sc = vol->v_softc; G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); vol->v_stopping = 1; if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { if (vol->v_tr) { G_RAID_TR_STOP(vol->v_tr); return (EBUSY); } else vol->v_state = G_RAID_VOLUME_S_STOPPED; } if (g_raid_event_check(sc, vol) != 0) return (EBUSY); if (vol->v_provider != NULL) return (EBUSY); if (vol->v_provider_open != 0) return (EBUSY); if (vol->v_tr) { G_RAID_TR_FREE(vol->v_tr); kobj_delete((kobj_t)vol->v_tr, M_RAID); vol->v_tr = NULL; } if (vol->v_rootmount) root_mount_rel(vol->v_rootmount); g_topology_lock(); LIST_REMOVE(vol, v_global_next); g_topology_unlock(); TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { g_raid_event_cancel(sc, &vol->v_subdisks[i]); disk = vol->v_subdisks[i].sd_disk; if (disk == NULL) continue; TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); } G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); if (sc->sc_md) G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); g_raid_event_cancel(sc, vol); free(vol, M_RAID); if (sc->sc_stopping == G_RAID_DESTROY_HARD) { /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmp; sc = disk->d_softc; G_RAID_DEBUG1(2, sc, "Destroying disk."); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); sd->sd_disk = NULL; } TAILQ_REMOVE(&sc->sc_disks, disk, d_next); if (sc->sc_md) G_RAID_MD_FREE_DISK(sc->sc_md, disk); g_raid_event_cancel(sc, disk); free(disk, M_RAID); return (0); } int g_raid_destroy(struct g_raid_softc *sc, int how) { int error, opens; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); /* Count open volumes. */ opens = g_raid_nopens(sc); /* React on some opened volumes. */ if (opens > 0) { switch (how) { case G_RAID_DESTROY_SOFT: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_DELAYED: G_RAID_DEBUG1(1, sc, "Array will be destroyed on last close."); sc->sc_stopping = G_RAID_DESTROY_DELAYED; sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_HARD: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); } } /* Mark node for destruction. */ sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); /* Sleep until node destroyed. */ error = sx_sleep(&sc->sc_stopping, &sc->sc_lock, PRIBIO | PDROP, "r:destroy", hz * 3); return (error == EWOULDBLOCK ? 
EBUSY : 0); } static void g_raid_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp, *geom; struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); if (!g_raid_enable) return (NULL); G_RAID_DEBUG(2, "Tasting provider %s.", pp->name); geom = NULL; status = G_RAID_MD_TASTE_FAIL; gp = g_new_geomf(mp, "raid:taste"); /* * This orphan function should be never called. */ gp->orphan = g_raid_taste_orphan; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_RECEIVE; g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) goto ofail; LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (!class->mdc_enable) continue; G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.", pp->name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_TASTE(obj, mp, cp, &geom); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); if (status != G_RAID_MD_TASTE_FAIL) break; } if (status == G_RAID_MD_TASTE_FAIL) (void)g_access(cp, -1, 0, 0); ofail: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name); return (geom); } int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp) { struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; G_RAID_DEBUG(2, "Creating array for %s metadata.", format); LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (strcasecmp(class->name, format) == 0) break; } if (class == NULL) { G_RAID_DEBUG(1, "No support for %s metadata.", format); return (G_RAID_MD_TASTE_FAIL); } obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); return (status); } static int g_raid_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); g_topology_lock(); return (error); } void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_stopping == G_RAID_DESTROY_HARD) return; if (sc->sc_md) G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); } void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (disk == NULL) disk = sd->sd_disk; if (disk == NULL) { G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!"); return; } if (disk->d_state != G_RAID_DISK_S_ACTIVE) { G_RAID_DEBUG1(0, sc, "Warning! 
Fail request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); return; } if (sc->sc_md) G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk); } static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, s; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { vol = pp->private; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s %s volume\n", indent, sc->sc_md->mdo_class->name, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s\n", indent, vol->v_name); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s%s\n", indent, vol->v_tr ? vol->v_tr->tro_class->name : "NONE"); sbuf_printf(sb, "%s%u\n", indent, vol->v_disks_count); sbuf_printf(sb, "%s%u\n", indent, vol->v_strip_size); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(vol->v_state)); sbuf_printf(sb, "%s%s\n", indent, vol->v_dirty ? "Yes" : "No"); sbuf_printf(sb, "%s", indent); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk != NULL && sd->sd_disk->d_consumer != NULL) { sbuf_printf(sb, "%s ", g_raid_get_diskname(sd->sd_disk)); } else { sbuf_cat(sb, "NONE "); } sbuf_printf(sb, "(%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } sbuf_cat(sb, ")"); if (i + 1 < vol->v_disks_count) sbuf_cat(sb, ", "); } sbuf_cat(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else if (cp != NULL) { disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s", indent, g_raid_disk_state2str(disk->d_state)); if (!TAILQ_EMPTY(&disk->d_subdisks)) { sbuf_cat(sb, " ("); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } if (TAILQ_NEXT(sd, sd_next)) sbuf_cat(sb, ", "); } sbuf_cat(sb, ")"); } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s", indent); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "r%d(%s):%d@%ju", sd->sd_volume->v_global_id, sd->sd_volume->v_name, sd->sd_pos, (uintmax_t)sd->sd_offset); if (TAILQ_NEXT(sd, sd_next)) sbuf_cat(sb, ", "); } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%d\n", indent, disk->d_read_errs); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_md->mdo_class->name); } if (!TAILQ_EMPTY(&sc->sc_volumes)) { s = 0xff; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_state < s) s = vol->v_state; } sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(s)); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid_softc *sc; struct g_raid_volume *vol; mp = arg; g_topology_lock(); g_raid_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); TAILQ_FOREACH(vol, &sc->sc_volumes, 
v_next) g_raid_clean(vol, -1); g_cancel_event(sc); g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); g_topology_lock(); } g_topology_unlock(); } static void g_raid_init(struct g_class *mp) { g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid_post_sync == NULL) G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); g_raid_started = 1; } static void g_raid_fini(struct g_class *mp) { if (g_raid_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync); g_raid_started = 0; } int g_raid_md_modevent(module_t mod, int type, void *arg) { struct g_raid_md_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_md_classes); if (c == NULL || c->mdc_priority > class->mdc_priority) LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); else { while ((nc = LIST_NEXT(c, mdc_list)) != NULL && nc->mdc_priority < class->mdc_priority) c = nc; LIST_INSERT_AFTER(c, class, mdc_list); } if (g_raid_started) g_retaste(&g_raid_class); break; case MOD_UNLOAD: LIST_REMOVE(class, mdc_list); break; default: error = EOPNOTSUPP; break; } return (error); } int g_raid_tr_modevent(module_t mod, int type, void *arg) { struct g_raid_tr_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_tr_classes); if (c == NULL || c->trc_priority > class->trc_priority) LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); else { while ((nc = LIST_NEXT(c, trc_list)) != NULL && nc->trc_priority < class->trc_priority) c = nc; LIST_INSERT_AFTER(c, class, trc_list); } break; case MOD_UNLOAD: LIST_REMOVE(class, trc_list); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) * to reduce module priority, allowing submodules to register them first. */ static moduledata_t g_raid_mod = { "g_raid", g_modevent, &g_raid_class }; DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); MODULE_VERSION(geom_raid, 0); Index: head/sys/geom/raid/g_raid.h =================================================================== --- head/sys/geom/raid/g_raid.h (revision 350693) +++ head/sys/geom/raid/g_raid.h (revision 350694) @@ -1,471 +1,445 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_RAID_H_ #define _G_RAID_H_ #include #include #include #include #ifdef _KERNEL #include #endif #define G_RAID_CLASS_NAME "RAID" #define G_RAID_MAGIC "GEOM::RAID" #define G_RAID_VERSION 0 struct g_raid_md_object; struct g_raid_tr_object; #define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL #define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \ G_RAID_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern u_int g_raid_aggressive_spare; extern u_int g_raid_debug; extern int g_raid_enable; extern int g_raid_read_err_thresh; extern u_int g_raid_start_timeout; extern struct g_class g_raid_class; -#define G_RAID_DEBUG(lvl, fmt, ...) do { \ - if (g_raid_debug >= (lvl)) { \ - if (g_raid_debug > 0) { \ - printf("GEOM_RAID[%u]: " fmt "\n", \ - lvl, ## __VA_ARGS__); \ - } else { \ - printf("GEOM_RAID: " fmt "\n", \ - ## __VA_ARGS__); \ - } \ - } \ -} while (0) -#define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \ - if (g_raid_debug >= (lvl)) { \ - if (g_raid_debug > 0) { \ - printf("GEOM_RAID[%u]: %s: " fmt "\n", \ - lvl, (sc)->sc_name, ## __VA_ARGS__); \ - } else { \ - printf("GEOM_RAID: %s: " fmt "\n", \ - (sc)->sc_name, ## __VA_ARGS__); \ - } \ - } \ -} while (0) -#define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \ - if (g_raid_debug >= (lvl)) { \ - if (g_raid_debug > 0) { \ - printf("GEOM_RAID[%u]: " fmt " ", \ - lvl, ## __VA_ARGS__); \ - } else \ - printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_RAID_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), NULL, __VA_ARGS__) +#define G_RAID_DEBUG1(lvl, sc, fmt, ...) \ + _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), NULL, "%s: " fmt, \ + (sc)->sc_name, ## __VA_ARGS__) +#define G_RAID_LOGREQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), (bp), __VA_ARGS__) /* * Flags we use to distinguish I/O initiated by the TR layer to maintain * the volume's characteristics, fix subdisks, extra copies of data, etc. * * G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data * for RAID volumes that maintain extra data * and need to rebuild that data. * G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into * doing some desirable action such as bad * block remapping after we detect a bad part * of the disk. * G_RAID_BIO_FLAG_LOCKED I/O holds range lock that should re released. * * and the following meta item: * G_RAID_BIO_FLAG_SPECIAL And of the I/O flags that need to make it * through the range locking which would * otherwise defer the I/O until after that * range is unlocked. 
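 */

/*
 * [Editor's aside -- a hedged, hypothetical example, not part of this
 * commit.]  The flags defined below travel in bp->bio_cflags.  A
 * transformation module tags its own rebuild/recovery clones so that the
 * range-lock check in g_raid_start_request() does not defer them
 * (G_RAID_BIO_FLAG_SPECIAL is simply the mask of such flags).  The
 * helper name is invented for the example.
 */
#if 0	/* editor's sketch; never compiled as part of g_raid */
static void
demo_tag_recovery_bio(struct bio *cbp)
{

	/*
	 * Mark this clone as TR-internal recovery I/O; bios carrying
	 * any flag in G_RAID_BIO_FLAG_SPECIAL bypass the locked-range
	 * deferral (see g_raid_start_request()).
	 */
	cbp->bio_cflags |= G_RAID_BIO_FLAG_SYNC;
}
#endif

/*
 * (End of editor's aside; the flag definitions follow.)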
*/ #define G_RAID_BIO_FLAG_SYNC 0x01 #define G_RAID_BIO_FLAG_REMAP 0x02 #define G_RAID_BIO_FLAG_SPECIAL \ (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP) #define G_RAID_BIO_FLAG_LOCKED 0x80 struct g_raid_lock { off_t l_offset; off_t l_length; void *l_callback_arg; int l_pending; LIST_ENTRY(g_raid_lock) l_next; }; #define G_RAID_EVENT_WAIT 0x01 #define G_RAID_EVENT_VOLUME 0x02 #define G_RAID_EVENT_SUBDISK 0x04 #define G_RAID_EVENT_DISK 0x08 #define G_RAID_EVENT_DONE 0x10 struct g_raid_event { void *e_tgt; int e_event; int e_flags; int e_error; TAILQ_ENTRY(g_raid_event) e_next; }; #define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */ #define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ #define G_RAID_DISK_S_DISABLED 0x02 /* Disabled. */ #define G_RAID_DISK_S_FAILED 0x03 /* Failed. */ #define G_RAID_DISK_S_STALE_FAILED 0x04 /* Old failed. */ #define G_RAID_DISK_S_SPARE 0x05 /* Hot-spare. */ #define G_RAID_DISK_S_STALE 0x06 /* Old disk, unused now. */ #define G_RAID_DISK_S_ACTIVE 0x07 /* Operational. */ #define G_RAID_DISK_E_DISCONNECTED 0x01 struct g_raid_disk { struct g_raid_softc *d_softc; /* Back-pointer to softc. */ struct g_consumer *d_consumer; /* GEOM disk consumer. */ void *d_md_data; /* Disk's metadata storage. */ int d_candelete; /* BIO_DELETE supported. */ uint64_t d_flags; /* Additional flags. */ u_int d_state; /* Disk state. */ u_int d_load; /* Disk average load. */ off_t d_last_offset; /* Last head offset. */ int d_read_errs; /* Count of the read errors */ TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ struct g_kerneldump d_kd; /* Kernel dumping method/args. */ }; #define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ #define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ #define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ #define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ #define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ #define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ #define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ #define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ #define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ #define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ #define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ #define G_RAID_SUBDISK_POS(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) #define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) #define G_RAID_SUBDISK_LOAD(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) #define G_RAID_SUBDISK_LOAD_SCALE 256 struct g_raid_subdisk { struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ off_t sd_offset; /* Offset on the disk. */ off_t sd_size; /* Size on the disk. */ u_int sd_pos; /* Position in volume. */ u_int sd_state; /* Subdisk state. */ off_t sd_rebuild_pos; /* Rebuild position. */ int sd_recovery; /* Count of recovery reqs. */ TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. 
*/ }; #define G_RAID_MAX_SUBDISKS 16 #define G_RAID_MAX_VOLUMENAME 32 #define G_RAID_VOLUME_S_STARTING 0x00 #define G_RAID_VOLUME_S_BROKEN 0x01 #define G_RAID_VOLUME_S_DEGRADED 0x02 #define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 #define G_RAID_VOLUME_S_OPTIMAL 0x04 #define G_RAID_VOLUME_S_UNSUPPORTED 0x05 #define G_RAID_VOLUME_S_STOPPED 0x06 #define G_RAID_VOLUME_S_ALIVE(s) \ ((s) == G_RAID_VOLUME_S_DEGRADED || \ (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ (s) == G_RAID_VOLUME_S_OPTIMAL) #define G_RAID_VOLUME_E_DOWN 0x00 #define G_RAID_VOLUME_E_UP 0x01 #define G_RAID_VOLUME_E_START 0x10 #define G_RAID_VOLUME_E_STARTMD 0x11 #define G_RAID_VOLUME_RL_RAID0 0x00 #define G_RAID_VOLUME_RL_RAID1 0x01 #define G_RAID_VOLUME_RL_RAID3 0x03 #define G_RAID_VOLUME_RL_RAID4 0x04 #define G_RAID_VOLUME_RL_RAID5 0x05 #define G_RAID_VOLUME_RL_RAID6 0x06 #define G_RAID_VOLUME_RL_RAIDMDF 0x07 #define G_RAID_VOLUME_RL_RAID1E 0x11 #define G_RAID_VOLUME_RL_SINGLE 0x0f #define G_RAID_VOLUME_RL_CONCAT 0x1f #define G_RAID_VOLUME_RL_RAID5E 0x15 #define G_RAID_VOLUME_RL_RAID5EE 0x25 #define G_RAID_VOLUME_RL_RAID5R 0x35 #define G_RAID_VOLUME_RL_UNKNOWN 0xff #define G_RAID_VOLUME_RLQ_NONE 0x00 #define G_RAID_VOLUME_RLQ_R1SM 0x00 #define G_RAID_VOLUME_RLQ_R1MM 0x01 #define G_RAID_VOLUME_RLQ_R3P0 0x00 #define G_RAID_VOLUME_RLQ_R3PN 0x01 #define G_RAID_VOLUME_RLQ_R4P0 0x00 #define G_RAID_VOLUME_RLQ_R4PN 0x01 #define G_RAID_VOLUME_RLQ_R5RA 0x00 #define G_RAID_VOLUME_RLQ_R5RS 0x01 #define G_RAID_VOLUME_RLQ_R5LA 0x02 #define G_RAID_VOLUME_RLQ_R5LS 0x03 #define G_RAID_VOLUME_RLQ_R6RA 0x00 #define G_RAID_VOLUME_RLQ_R6RS 0x01 #define G_RAID_VOLUME_RLQ_R6LA 0x02 #define G_RAID_VOLUME_RLQ_R6LS 0x03 #define G_RAID_VOLUME_RLQ_RMDFRA 0x00 #define G_RAID_VOLUME_RLQ_RMDFRS 0x01 #define G_RAID_VOLUME_RLQ_RMDFLA 0x02 #define G_RAID_VOLUME_RLQ_RMDFLS 0x03 #define G_RAID_VOLUME_RLQ_R1EA 0x00 #define G_RAID_VOLUME_RLQ_R1EO 0x01 #define G_RAID_VOLUME_RLQ_R5ERA 0x00 #define G_RAID_VOLUME_RLQ_R5ERS 0x01 #define G_RAID_VOLUME_RLQ_R5ELA 0x02 #define G_RAID_VOLUME_RLQ_R5ELS 0x03 #define G_RAID_VOLUME_RLQ_R5EERA 0x00 #define G_RAID_VOLUME_RLQ_R5EERS 0x01 #define G_RAID_VOLUME_RLQ_R5EELA 0x02 #define G_RAID_VOLUME_RLQ_R5EELS 0x03 #define G_RAID_VOLUME_RLQ_R5RRA 0x00 #define G_RAID_VOLUME_RLQ_R5RRS 0x01 #define G_RAID_VOLUME_RLQ_R5RLA 0x02 #define G_RAID_VOLUME_RLQ_R5RLS 0x03 #define G_RAID_VOLUME_RLQ_UNKNOWN 0xff struct g_raid_volume; struct g_raid_volume { struct g_raid_softc *v_softc; /* Back-pointer to softc. */ struct g_provider *v_provider; /* GEOM provider. */ struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; /* Subdisks of this volume. */ void *v_md_data; /* Volume's metadata storage. */ struct g_raid_tr_object *v_tr; /* Transformation object. */ char v_name[G_RAID_MAX_VOLUMENAME]; /* Volume name. */ u_int v_state; /* Volume state. */ u_int v_raid_level; /* Array RAID level. */ u_int v_raid_level_qualifier; /* RAID level det. */ u_int v_disks_count; /* Number of disks in array. */ u_int v_mdf_pdisks; /* Number of parity disks in RAIDMDF array. */ uint16_t v_mdf_polynomial; /* Polynomial for RAIDMDF. */ uint8_t v_mdf_method; /* Generation method for RAIDMDF. */ u_int v_strip_size; /* Array strip size. */ u_int v_rotate_parity; /* Rotate RAID5R parity after numer of stripes. */ u_int v_sectorsize; /* Volume sector size. */ off_t v_mediasize; /* Volume media size. */ struct bio_queue_head v_inflight; /* In-flight write requests. */ struct bio_queue_head v_locked; /* Blocked I/O requests. */ LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. 
*/ int v_pending_lock; /* writes to locked region */ int v_dirty; /* Volume is DIRTY. */ struct timeval v_last_done; /* Time of the last I/O. */ time_t v_last_write; /* Time of the last write. */ u_int v_writes; /* Number of active writes. */ struct root_hold_token *v_rootmount; /* Root mount delay token. */ int v_starting; /* Volume is starting */ int v_stopping; /* Volume is stopping */ int v_provider_open; /* Number of opens. */ int v_global_id; /* Global volume ID (rX). */ int v_read_only; /* Volume is read-only. */ TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ }; #define G_RAID_NODE_E_WAKE 0x00 #define G_RAID_NODE_E_START 0x01 struct g_raid_softc { struct g_raid_md_object *sc_md; /* Metadata object. */ struct g_geom *sc_geom; /* GEOM class instance. */ uint64_t sc_flags; /* Additional flags. */ TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ struct sx sc_lock; /* Main node lock. */ struct proc *sc_worker; /* Worker process. */ struct mtx sc_queue_mtx; /* Worker queues lock. */ TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ struct bio_queue_head sc_queue; /* Worker I/O queue. */ int sc_stopping; /* Node is stopping */ }; #define sc_name sc_geom->name SYSCTL_DECL(_kern_geom_raid); /* * KOBJ parent class of metadata processing modules. */ struct g_raid_md_class { KOBJ_CLASS_FIELDS; int mdc_enable; int mdc_priority; LIST_ENTRY(g_raid_md_class) mdc_list; }; /* * KOBJ instance of metadata processing module. */ struct g_raid_md_object { KOBJ_FIELDS; struct g_raid_md_class *mdo_class; struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */ }; int g_raid_md_modevent(module_t, int, void *); #define G_RAID_MD_DECLARE(name, label) \ static moduledata_t g_raid_md_##name##_mod = { \ "g_raid_md_" __XSTRING(name), \ g_raid_md_modevent, \ &g_raid_md_##name##_class \ }; \ DECLARE_MODULE(g_raid_md_##name, g_raid_md_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_SECOND); \ MODULE_DEPEND(g_raid_md_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " metadata module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RWTUN, &g_raid_md_##name##_class.mdc_enable, 0, \ "Enable " label " metadata format taste") /* * KOBJ parent class of data transformation modules. */ struct g_raid_tr_class { KOBJ_CLASS_FIELDS; int trc_enable; int trc_priority; int trc_accept_unmapped; LIST_ENTRY(g_raid_tr_class) trc_list; }; /* * KOBJ instance of data transformation module. */ struct g_raid_tr_object { KOBJ_FIELDS; struct g_raid_tr_class *tro_class; struct g_raid_volume *tro_volume; /* Back-pointer to volume. 
*/ }; int g_raid_tr_modevent(module_t, int, void *); #define G_RAID_TR_DECLARE(name, label) \ static moduledata_t g_raid_tr_##name##_mod = { \ "g_raid_tr_" __XSTRING(name), \ g_raid_tr_modevent, \ &g_raid_tr_##name##_class \ }; \ DECLARE_MODULE(g_raid_tr_##name, g_raid_tr_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_FIRST); \ MODULE_DEPEND(g_raid_tr_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " transformation module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RWTUN, &g_raid_tr_##name##_class.trc_enable, 0, \ "Enable " label " transformation module taste") const char * g_raid_volume_level2str(int level, int qual); int g_raid_volume_str2level(const char *str, int *level, int *qual); const char * g_raid_volume_state2str(int state); const char * g_raid_subdisk_state2str(int state); const char * g_raid_disk_state2str(int state); struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md); int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp); struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id); struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); const char * g_raid_get_diskname(struct g_raid_disk *disk); void g_raid_get_disk_info(struct g_raid_disk *disk); int g_raid_start_volume(struct g_raid_volume *vol); int g_raid_destroy_node(struct g_raid_softc *sc, int worker); int g_raid_destroy_volume(struct g_raid_volume *vol); int g_raid_destroy_disk(struct g_raid_disk *disk); void g_raid_iodone(struct bio *bp, int error); void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length); struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, const char *name); void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); void g_raid_report_disk_state(struct g_raid_disk *disk); void g_raid_change_disk_state(struct g_raid_disk *disk, int state); void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); void g_raid_change_volume_state(struct g_raid_volume *vol, int state); void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp); int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length); u_int g_raid_ndisks(struct g_raid_softc *sc, int state); u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); u_int g_raid_nopens(struct g_raid_softc *sc); struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state); #define G_RAID_DESTROY_SOFT 0 #define G_RAID_DESTROY_DELAYED 1 #define G_RAID_DESTROY_HARD 2 int g_raid_destroy(struct g_raid_softc *sc, int how); int g_raid_event_send(void *arg, int event, int flags); int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp); int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); g_ctl_req_t g_raid_ctl; #endif /* _KERNEL */ #endif /* !_G_RAID_H_ */ Index: head/sys/geom/raid/md_ddf.c =================================================================== --- head/sys/geom/raid/md_ddf.c 
(revision 350693) +++ head/sys/geom/raid/md_ddf.c (revision 350694) @@ -1,3089 +1,3090 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "geom/raid/md_ddf.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata"); #define DDF_MAX_DISKS_HARD 128 #define DDF_MAX_DISKS 16 #define DDF_MAX_VDISKS 7 #define DDF_MAX_PARTITIONS 1 #define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */ struct ddf_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_pd_record *pdr; struct ddf_vd_record *vdr; void *cr; struct ddf_pdd_record *pdd; struct ddf_bbm_log *bbm; }; struct ddf_vol_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD]; }; struct g_raid_md_ddf_perdisk { struct ddf_meta pd_meta; }; struct g_raid_md_ddf_pervolume { struct ddf_vol_meta pv_meta; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; struct g_raid_md_ddf_object { struct g_raid_md_object mdio_base; u_int mdio_bigendian; struct ddf_meta mdio_meta; int mdio_starting; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_started; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
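	 * Assumed behavior of the standard root_hold_token: taken while
	 * discovered volumes are still assembling so that mounting the
	 * root file system is delayed until they have started.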
*/ }; static g_raid_md_create_req_t g_raid_md_create_req_ddf; static g_raid_md_taste_t g_raid_md_taste_ddf; static g_raid_md_event_t g_raid_md_event_ddf; static g_raid_md_volume_event_t g_raid_md_volume_event_ddf; static g_raid_md_ctl_t g_raid_md_ctl_ddf; static g_raid_md_write_t g_raid_md_write_ddf; static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf; static g_raid_md_free_disk_t g_raid_md_free_disk_ddf; static g_raid_md_free_volume_t g_raid_md_free_volume_ddf; static g_raid_md_free_t g_raid_md_free_ddf; static kobj_method_t g_raid_md_ddf_methods[] = { KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf), KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf), KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf), KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf), { 0, 0 } }; static struct g_raid_md_class g_raid_md_ddf_class = { "DDF", g_raid_md_ddf_methods, sizeof(struct g_raid_md_ddf_object), .mdc_enable = 1, .mdc_priority = 100 }; #define GET8(m, f) ((m)->f) #define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f)) #define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f)) #define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f)) #define GET8D(m, f) (f) #define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f)) #define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f)) #define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f)) #define GET8P(m, f) (*(f)) #define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f)) #define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f)) #define GET64P(m, f) ((m)->bigendian ? 
be64dec(f) : le64dec(f)) #define SET8P(m, f, v) \ (*(f) = (v)) #define SET16P(m, f, v) \ do { \ if ((m)->bigendian) \ be16enc((f), (v)); \ else \ le16enc((f), (v)); \ } while (0) #define SET32P(m, f, v) \ do { \ if ((m)->bigendian) \ be32enc((f), (v)); \ else \ le32enc((f), (v)); \ } while (0) #define SET64P(m, f, v) \ do { \ if ((m)->bigendian) \ be64enc((f), (v)); \ else \ le64enc((f), (v)); \ } while (0) #define SET8(m, f, v) SET8P((m), &((m)->f), (v)) #define SET16(m, f, v) SET16P((m), &((m)->f), (v)) #define SET32(m, f, v) SET32P((m), &((m)->f), (v)) #define SET64(m, f, v) SET64P((m), &((m)->f), (v)) #define SET8D(m, f, v) SET8P((m), &(f), (v)) #define SET16D(m, f, v) SET16P((m), &(f), (v)) #define SET32D(m, f, v) SET32P((m), &(f), (v)) #define SET64D(m, f, v) SET64P((m), &(f), (v)) #define GETCRNUM(m) (GET32((m), hdr->cr_length) / \ GET16((m), hdr->Configuration_Record_Length)) #define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) #define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) static int isff(uint8_t *buf, int size) { int i; for (i = 0; i < size; i++) if (buf[i] != 0xff) return (0); return (1); } static void print_guid(uint8_t *buf) { int i, ascii; ascii = 1; for (i = 0; i < 24; i++) { if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) { ascii = 0; break; } } if (ascii) { printf("'%.24s'", buf); } else { for (i = 0; i < 24; i++) printf("%02x", buf[i]); } } static void g_raid_md_ddf_print(struct ddf_meta *meta) { struct ddf_vdc_record *vdc; struct ddf_vuc_record *vuc; struct ddf_sa_record *sa; uint64_t *val2; uint32_t val; int i, j, k, num, num2; if (g_raid_debug < 1) return; printf("********* DDF Metadata *********\n"); printf("**** Header ****\n"); printf("DDF_Header_GUID "); print_guid(meta->hdr->DDF_Header_GUID); printf("\n"); printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]); printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number)); printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp)); printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag)); printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag)); printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping)); printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA)); printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA)); printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length)); printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA)); printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries)); printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries)); printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions)); printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length)); printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries)); printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length)); printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length)); printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length)); printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length)); printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length)); printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, 
hdr->bbmlog_length)); printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length)); printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length)); printf("**** Controller Data ****\n"); printf("Controller_GUID "); print_guid(meta->cdr->Controller_GUID); printf("\n"); printf("Controller_Type 0x%04x%04x 0x%04x%04x\n", GET16(meta, cdr->Controller_Type.Vendor_ID), GET16(meta, cdr->Controller_Type.Device_ID), GET16(meta, cdr->Controller_Type.SubVendor_ID), GET16(meta, cdr->Controller_Type.SubDevice_ID)); printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]); printf("**** Physical Disk Records ****\n"); printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs)); printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported)); for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) { if (isff(meta->pdr->entry[j].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff) continue; printf("PD_GUID "); print_guid(meta->pdr->entry[j].PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdr->entry[j].PD_Reference)); printf("PD_Type 0x%04x\n", GET16(meta, pdr->entry[j].PD_Type)); printf("PD_State 0x%04x\n", GET16(meta, pdr->entry[j].PD_State)); printf("Configured_Size %ju\n", GET64(meta, pdr->entry[j].Configured_Size)); printf("Block_Size %u\n", GET16(meta, pdr->entry[j].Block_Size)); } printf("**** Virtual Disk Records ****\n"); printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs)); printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported)); for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) { if (isff(meta->vdr->entry[j].VD_GUID, 24)) continue; printf("VD_GUID "); print_guid(meta->vdr->entry[j].VD_GUID); printf("\n"); printf("VD_Number 0x%04x\n", GET16(meta, vdr->entry[j].VD_Number)); printf("VD_Type 0x%04x\n", GET16(meta, vdr->entry[j].VD_Type)); printf("VD_State 0x%02x\n", GET8(meta, vdr->entry[j].VD_State)); printf("Init_State 0x%02x\n", GET8(meta, vdr->entry[j].Init_State)); printf("Drive_Failures_Remaining %u\n", GET8(meta, vdr->entry[j].Drive_Failures_Remaining)); printf("VD_Name '%.16s'\n", (char *)&meta->vdr->entry[j].VD_Name); } printf("**** Configuration Records ****\n"); num = GETCRNUM(meta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(meta, j); val = GET32D(meta, vdc->Signature); switch (val) { case DDF_VDCR_SIGNATURE: printf("** Virtual Disk Configuration **\n"); printf("VD_GUID "); print_guid(vdc->VD_GUID); printf("\n"); printf("Timestamp 0x%08x\n", GET32D(meta, vdc->Timestamp)); printf("Sequence_Number 0x%08x\n", GET32D(meta, vdc->Sequence_Number)); printf("Primary_Element_Count %u\n", GET16D(meta, vdc->Primary_Element_Count)); printf("Stripe_Size %u\n", GET8D(meta, vdc->Stripe_Size)); printf("Primary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Primary_RAID_Level)); printf("RLQ 0x%02x\n", GET8D(meta, vdc->RLQ)); printf("Secondary_Element_Count %u\n", GET8D(meta, vdc->Secondary_Element_Count)); printf("Secondary_Element_Seq %u\n", GET8D(meta, vdc->Secondary_Element_Seq)); printf("Secondary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Secondary_RAID_Level)); printf("Block_Count %ju\n", GET64D(meta, vdc->Block_Count)); printf("VD_Size %ju\n", GET64D(meta, vdc->VD_Size)); printf("Block_Size %u\n", GET16D(meta, vdc->Block_Size)); printf("Rotate_Parity_count %u\n", GET8D(meta, vdc->Rotate_Parity_count)); printf("Associated_Spare_Disks"); for (i = 0; i < 8; i++) { if (GET32D(meta, 
vdc->Associated_Spares[i]) != 0xffffffff) printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i])); } printf("\n"); printf("Cache_Flags %016jx\n", GET64D(meta, vdc->Cache_Flags)); printf("BG_Rate %u\n", GET8D(meta, vdc->BG_Rate)); printf("MDF_Parity_Disks %u\n", GET8D(meta, vdc->MDF_Parity_Disks)); printf("MDF_Parity_Generator_Polynomial 0x%04x\n", GET16D(meta, vdc->MDF_Parity_Generator_Polynomial)); printf("MDF_Constant_Generation_Method 0x%02x\n", GET8D(meta, vdc->MDF_Constant_Generation_Method)); printf("Physical_Disks "); num2 = GET16D(meta, vdc->Primary_Element_Count); val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]); for (i = 0; i < num2; i++) printf(" 0x%08x @ %ju", GET32D(meta, vdc->Physical_Disk_Sequence[i]), GET64P(meta, val2 + i)); printf("\n"); break; case DDF_VUCR_SIGNATURE: printf("** Vendor Unique Configuration **\n"); vuc = (struct ddf_vuc_record *)vdc; printf("VD_GUID "); print_guid(vuc->VD_GUID); printf("\n"); break; case DDF_SA_SIGNATURE: printf("** Spare Assignment Configuration **\n"); sa = (struct ddf_sa_record *)vdc; printf("Timestamp 0x%08x\n", GET32D(meta, sa->Timestamp)); printf("Spare_Type 0x%02x\n", GET8D(meta, sa->Spare_Type)); printf("Populated_SAEs %u\n", GET16D(meta, sa->Populated_SAEs)); printf("MAX_SAE_Supported %u\n", GET16D(meta, sa->MAX_SAE_Supported)); for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) { if (isff(sa->entry[i].VD_GUID, 24)) continue; printf("VD_GUID "); for (k = 0; k < 24; k++) printf("%02x", sa->entry[i].VD_GUID[k]); printf("\n"); printf("Secondary_Element %u\n", GET16D(meta, sa->entry[i].Secondary_Element)); } break; case 0x00000000: case 0xFFFFFFFF: break; default: printf("Unknown configuration signature %08x\n", val); break; } } printf("**** Physical Disk Data ****\n"); printf("PD_GUID "); print_guid(meta->pdd->PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdd->PD_Reference)); printf("Forced_Ref_Flag 0x%02x\n", GET8(meta, pdd->Forced_Ref_Flag)); printf("Forced_PD_GUID_Flag 0x%02x\n", GET8(meta, pdd->Forced_PD_GUID_Flag)); } static int ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference) { int i; for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (GUID != NULL) { if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0) return (i); } else if (PD_Reference != 0xffffffff) { if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference) return (i); } else if (isff(meta->pdr->entry[i].PD_GUID, 24)) return (i); } if (GUID == NULL && PD_Reference == 0xffffffff) { if (i >= GET16(meta, pdr->Max_PDE_Supported)) return (-1); SET16(meta, pdr->Populated_PDEs, i + 1); return (i); } return (-1); } static int ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID) { int i; for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) { if (GUID != NULL) { if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0) return (i); } else if (isff(meta->vdr->entry[i].VD_GUID, 24)) return (i); } if (GUID == NULL) { if (i >= GET16(meta, vdr->Max_VDE_Supported)) return (-1); SET16(meta, vdr->Populated_VDEs, i + 1); return (i); } return (-1); } static struct ddf_vdc_record * ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GUID != NULL) { if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE && memcmp(vdc->VD_GUID, GUID, 24) == 0) return (vdc); } else if (GET32D(meta, vdc->Signature) == 0xffffffff || GET32D(meta, vdc->Signature) == 0) 
return (vdc); } return (NULL); } static int ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num, cnt; cnt = 0; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0) cnt++; } return (cnt); } static int ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference, int *bvdp, int *posp) { int i, bvd, pos; i = 0; for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) { if (vmeta->bvdc[bvd] == NULL) { i += GET16(vmeta, vdc->Primary_Element_Count); // XXX continue; } for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count); pos++, i++) { if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) == PD_Reference) { if (bvdp != NULL) *bvdp = bvd; if (posp != NULL) *posp = pos; return (i); } } } return (-1); } static struct ddf_sa_record * ddf_meta_find_sa(struct ddf_meta *meta, int create) { struct ddf_sa_record *sa; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE) return (sa); } if (create) { for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == 0xffffffff || GET32D(meta, sa->Signature) == 0) return (sa); } } return (NULL); } static void ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct ddf_meta *meta; struct ddf_pd_entry *pde; off_t anchorlba; u_int ss, pos, size; int len, error; char serial_buffer[DISK_IDENT_SIZE]; if (sample->hdr == NULL) sample = NULL; mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; ss = disk->d_consumer->provider->sectorsize; anchorlba = disk->d_consumer->provider->mediasize / ss - 1; meta->sectorsize = ss; meta->bigendian = sample ? 
sample->bigendian : mdi->mdio_bigendian; getnanotime(&ts); clock_ts_to_ct(&ts, &ct); /* Header */ meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memset(meta->hdr, 0xff, ss); if (sample) { memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header)); if (ss != sample->sectorsize) { SET32(meta, hdr->WorkSpace_Length, howmany(GET32(sample, hdr->WorkSpace_Length) * sample->sectorsize, ss)); SET16(meta, hdr->Configuration_Record_Length, howmany(GET16(sample, hdr->Configuration_Record_Length) * sample->sectorsize, ss)); SET32(meta, hdr->cd_length, howmany(GET32(sample, hdr->cd_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdr_length, howmany(GET32(sample, hdr->pdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->vdr_length, howmany(GET32(sample, hdr->vdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->cr_length, howmany(GET32(sample, hdr->cr_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdd_length, howmany(GET32(sample, hdr->pdd_length) * sample->sectorsize, ss)); SET32(meta, hdr->bbmlog_length, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Diagnostic_Space, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Vendor_Specific_Logs, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); } } else { SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE); snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x", (u_int)(ts.tv_sec - DECADE), arc4random()); memcpy(meta->hdr->DDF_rev, "02.00.00", 8); SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE)); SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss); SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1); SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS); SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS); SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS); SET16(meta, hdr->Configuration_Record_Length, howmany(sizeof(struct ddf_vdc_record) + (4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries), ss)); SET32(meta, hdr->cd_length, howmany(sizeof(struct ddf_cd_record), ss)); SET32(meta, hdr->pdr_length, howmany(sizeof(struct ddf_pd_record) + sizeof(struct ddf_pd_entry) * GET16(meta, hdr->Max_PD_Entries), ss)); SET32(meta, hdr->vdr_length, howmany(sizeof(struct ddf_vd_record) + sizeof(struct ddf_vd_entry) * GET16(meta, hdr->Max_VD_Entries), ss)); SET32(meta, hdr->cr_length, GET16(meta, hdr->Configuration_Record_Length) * (GET16(meta, hdr->Max_Partitions) + 1)); SET32(meta, hdr->pdd_length, howmany(sizeof(struct ddf_pdd_record), ss)); SET32(meta, hdr->bbmlog_length, 0); SET32(meta, hdr->Diagnostic_Space_Length, 0); SET32(meta, hdr->Vendor_Specific_Logs_Length, 0); } pos = 1; SET32(meta, hdr->cd_section, pos); pos += GET32(meta, hdr->cd_length); SET32(meta, hdr->pdr_section, pos); pos += GET32(meta, hdr->pdr_length); SET32(meta, hdr->vdr_section, pos); pos += GET32(meta, hdr->vdr_length); SET32(meta, hdr->cr_section, pos); pos += GET32(meta, hdr->cr_length); SET32(meta, hdr->pdd_section, pos); pos += GET32(meta, hdr->pdd_length); SET32(meta, hdr->bbmlog_section, GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->bbmlog_length); SET32(meta, hdr->Diagnostic_Space, GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->Diagnostic_Space_Length); SET32(meta, hdr->Vendor_Specific_Logs, GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? 
pos : 0xffffffff); pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1); SET64(meta, hdr->Primary_Header_LBA, anchorlba - pos); SET64(meta, hdr->Secondary_Header_LBA, 0xffffffffffffffffULL); SET64(meta, hdr->WorkSpace_LBA, anchorlba + 1 - 32 * 1024 * 1024 / ss); /* Controller Data */ size = GET32(meta, hdr->cd_length) * ss; meta->cdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cdr, 0xff, size); SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE); memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24); memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16); /* Physical Drive Records. */ size = GET32(meta, hdr->pdr_length) * ss; meta->pdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdr, 0xff, size); SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE); SET16(meta, pdr->Populated_PDEs, 1); SET16(meta, pdr->Max_PDE_Supported, GET16(meta, hdr->Max_PD_Entries)); pde = &meta->pdr->entry[0]; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer); if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20) snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer); else snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xffff); SET32D(meta, pde->PD_Reference, arc4random()); SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE); SET16D(meta, pde->PD_State, 0); SET64D(meta, pde->Configured_Size, anchorlba + 1 - 32 * 1024 * 1024 / ss); SET16D(meta, pde->Block_Size, ss); /* Virtual Drive Records. */ size = GET32(meta, hdr->vdr_length) * ss; meta->vdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdr, 0xff, size); SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE); SET32(meta, vdr->Populated_VDEs, 0); SET16(meta, vdr->Max_VDE_Supported, GET16(meta, hdr->Max_VD_Entries)); /* Configuration Records. */ size = GET32(meta, hdr->cr_length) * ss; meta->cr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cr, 0xff, size); /* Physical Disk Data. */ size = GET32(meta, hdr->pdd_length) * ss; meta->pdd = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdd, 0xff, size); SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE); memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24); SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference)); SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF); SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID); /* Bad Block Management Log. 
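	 * The BBM log is optional: it is allocated below only when the
	 * header reports a non-zero bbmlog_length.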
*/ if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; meta->bbm = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->bbm, 0xff, size); SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE); SET32(meta, bbm->Entry_Count, 0); SET32(meta, bbm->Spare_Block_Count, 0); } } static void ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src) { u_int ss; dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss); dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss); dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss); dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss); if (src->bbm != NULL) { dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss); } } static void ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src) { struct ddf_pd_entry *pde, *spde; int i, j; for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) { spde = &src->pdr->entry[i]; if (isff(spde->PD_GUID, 24)) continue; j = ddf_meta_find_pd(meta, NULL, GET32(src, pdr->entry[i].PD_Reference)); if (j < 0) { j = ddf_meta_find_pd(meta, NULL, 0xffffffff); pde = &meta->pdr->entry[j]; memcpy(pde, spde, sizeof(*pde)); } else { pde = &meta->pdr->entry[j]; SET16D(meta, pde->PD_State, GET16D(meta, pde->PD_State) | GET16D(src, pde->PD_State)); } } } static void ddf_meta_free(struct ddf_meta *meta) { if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->pdr != NULL) { free(meta->pdr, M_MD_DDF); meta->pdr = NULL; } if (meta->vdr != NULL) { free(meta->vdr, M_MD_DDF); meta->vdr = NULL; } if (meta->cr != NULL) { free(meta->cr, M_MD_DDF); meta->cr = NULL; } if (meta->pdd != NULL) { free(meta->pdd, M_MD_DDF); meta->pdd = NULL; } if (meta->bbm != NULL) { free(meta->bbm, M_MD_DDF); meta->bbm = NULL; } } static void ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; u_int ss, size; meta->bigendian = sample->bigendian; ss = meta->sectorsize = sample->sectorsize; meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, sample->hdr, ss); meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss); meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry)); getnanotime(&ts); clock_ts_to_ct(&ts, &ct); snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xf); size = GET16(sample, hdr->Configuration_Record_Length) * ss; meta->vdc = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdc, 0xff, size); SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE); memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24); SET32(meta, vdc->Sequence_Number, 0); } static void ddf_vol_meta_update(struct ddf_vol_meta 
*dst, struct ddf_meta *src, uint8_t *GUID, int started) { struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; int vnew, bvnew, bvd, size; u_int ss; vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)]; vdc = ddf_meta_find_vdc(src, GUID); if (GET8D(src, vdc->Secondary_Element_Count) == 1) bvd = 0; else bvd = GET8D(src, vdc->Secondary_Element_Seq); size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize; if (dst->vdc == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, vdc->Sequence_Number))) > 0)) vnew = 1; else vnew = 0; if (dst->bvdc[bvd] == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, bvdc[bvd]->Sequence_Number))) > 0)) bvnew = 1; else bvnew = 0; if (vnew) { dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; if (dst->hdr != NULL) free(dst->hdr, M_MD_DDF); dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); if (dst->cdr != NULL) free(dst->cdr, M_MD_DDF); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); if (dst->vde != NULL) free(dst->vde, M_MD_DDF); dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry)); if (dst->vdc != NULL) free(dst->vdc, M_MD_DDF); dst->vdc = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->vdc, vdc, size); } if (bvnew) { if (dst->bvdc[bvd] != NULL) free(dst->bvdc[bvd], M_MD_DDF); dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->bvdc[bvd], vdc, size); } } static void ddf_vol_meta_free(struct ddf_vol_meta *meta) { int i; if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->vde != NULL) { free(meta->vde, M_MD_DDF); meta->vde = NULL; } if (meta->vdc != NULL) { free(meta->vdc, M_MD_DDF); meta->vdc = NULL; } for (i = 0; i < DDF_MAX_DISKS_HARD; i++) { if (meta->bvdc[i] != NULL) { free(meta->bvdc[i], M_MD_DDF); meta->bvdc[i] = NULL; } } } static int ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size) { struct ddf_vdc_record *vdc; off_t beg[32], end[32], beg1, end1; uint64_t *offp; int i, j, n, num, pos; uint32_t ref; *off = 0; *size = 0; ref = GET32(meta, pdd->PD_Reference); pos = ddf_meta_find_pd(meta, NULL, ref); beg[0] = 0; end[0] = GET64(meta, pdr->entry[pos].Configured_Size); n = 1; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++) if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref) break; if (pos == GET16D(meta, vdc->Primary_Element_Count)) continue; offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[ GET16(meta, hdr->Max_Primary_Element_Entries)]); beg1 = GET64P(meta, offp + pos); end1 = beg1 + GET64D(meta, vdc->Block_Count); for (j = 0; j < n; j++) { if (beg[j] >= end1 || end[j] <= beg1 ) continue; if (beg[j] < beg1 && end[j] > end1) { beg[n] = end1; end[n] = end[j]; end[j] = beg1; n++; } else if (beg[j] < beg1) end[j] = beg1; else beg[j] = end1; } } for (j = 0; j < n; j++) { if (end[j] - beg[j] > *size) { *off = beg[j]; *size = end[j] - beg[j]; } } return ((*size > 0) ? 
1 : 0); } static void ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf) { const char *b; int i; b = meta->vdr->entry[num].VD_Name; for (i = 15; i >= 0; i--) if (b[i] != 0x20) break; memcpy(buf, b, i + 1); buf[i + 1] = 0; } static void ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf) { int len; len = min(strlen(buf), 16); memset(meta->vde->VD_Name, 0x20, 16); memcpy(meta->vde->VD_Name, buf, len); } static int ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_header *ahdr, *hdr; char *abuf, *buf; off_t plba, slba, lba; int error, len, i; u_int ss; uint32_t val; ddf_meta_free(meta); pp = cp->provider; ss = meta->sectorsize = pp->sectorsize; /* Read anchor block. */ abuf = g_read_data(cp, pp->mediasize - ss, ss, &error); if (abuf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (error); } ahdr = (struct ddf_header *)abuf; /* Check if this is an DDF RAID struct */ if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 1; else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 0; else { G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name); error = EINVAL; goto done; } if (ahdr->Header_Type != DDF_HEADER_ANCHOR) { G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name); error = EINVAL; goto done; } meta->hdr = ahdr; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); meta->hdr = NULL; if (crc32(ahdr, ss) != val) { G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name); error = EINVAL; goto done; } if ((plba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } if (slba != -1 && (slba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } lba = plba; doread: error = 0; ddf_meta_free(meta); /* Read header block. */ buf = g_read_data(cp, lba * ss, ss, &error); if (buf == NULL) { readerror: G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).", (lba == plba) ? "primary" : "secondary", pp->name, error); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name); goto done; } meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, buf, ss); g_free(buf); hdr = meta->hdr; val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); if (hdr->Signature != ahdr->Signature || crc32(meta->hdr, ss) != val || memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) || GET64(meta, hdr->Primary_Header_LBA) != plba || GET64(meta, hdr->Secondary_Header_LBA) != slba) { hdrerror: G_RAID_DEBUG(1, "DDF %s metadata check failed on %s", (lba == plba) ? 
"primary" : "secondary", pp->name); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name); error = EINVAL; goto done; } if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) || (lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY)) goto hdrerror; len = 1; len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length)); len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length)); len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length)); len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length)); len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length)); if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->bbmlog_length)); if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length)); if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length)); if ((plba + len) * ss >= pp->mediasize) goto hdrerror; if (slba != -1 && (slba + len) * ss >= pp->mediasize) goto hdrerror; /* Workaround for Adaptec implementation. */ if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) { SET16(meta, hdr->Max_Primary_Element_Entries, min(GET16(meta, hdr->Max_PD_Entries), (GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12)); } if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS || GET32(meta, hdr->pdr_length) * ss >= MAXPHYS || GET32(meta, hdr->vdr_length) * ss >= MAXPHYS || GET32(meta, hdr->cr_length) * ss >= MAXPHYS || GET32(meta, hdr->pdd_length) * ss >= MAXPHYS || GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); goto hdrerror; } /* Read controller data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, GET32(meta, hdr->cd_length) * ss, &error); if (buf == NULL) goto readerror; meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss); g_free(buf); if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE) goto hdrerror; /* Read physical disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, GET32(meta, hdr->pdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss); g_free(buf); if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE) goto hdrerror; /* * Workaround for reading metadata corrupted due to graid bug. * XXX: Remove this before we have disks above 128PB. :) */ if (meta->bigendian) { for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (isff(meta->pdr->entry[i].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[i].PD_Reference) == 0xffffffff) continue; if (GET64(meta, pdr->entry[i].Configured_Size) >= (1ULL << 48)) { SET16(meta, pdr->entry[i].PD_State, GET16(meta, pdr->entry[i].PD_State) & ~DDF_PDE_FAILED); SET64(meta, pdr->entry[i].Configured_Size, GET64(meta, pdr->entry[i].Configured_Size) & ((1ULL << 48) - 1)); } } } /* Read virtual disk records. 
*/ buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, GET32(meta, hdr->vdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss); g_free(buf); if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE) goto hdrerror; /* Read configuration records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, GET32(meta, hdr->cr_length) * ss, &error); if (buf == NULL) goto readerror; meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss); g_free(buf); /* Read physical disk data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, GET32(meta, hdr->pdd_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss); g_free(buf); if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE) goto hdrerror; i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference)); if (i < 0) goto hdrerror; /* Read BBM Log. */ if (GET32(meta, hdr->bbmlog_section) != 0xffffffff && GET32(meta, hdr->bbmlog_length) != 0) { buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, GET32(meta, hdr->bbmlog_length) * ss, &error); if (buf == NULL) goto readerror; meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss); g_free(buf); if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE) goto hdrerror; } done: g_free(abuf); if (error != 0) ddf_meta_free(meta); return (error); } static int ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_vdc_record *vdc; off_t alba, plba, slba, lba; u_int ss, size; int error, i, num; pp = cp->provider; ss = pp->sectorsize; lba = alba = pp->mediasize / ss - 1; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); next: SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR : (lba == plba) ? 
DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY); SET32(meta, hdr->CRC, 0xffffffff); SET32(meta, hdr->CRC, crc32(meta->hdr, ss)); error = g_write_data(cp, lba * ss, meta->hdr, ss); if (error != 0) { err: G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); if (lba != alba) goto done; } if (lba == alba) { lba = plba; goto next; } size = GET32(meta, hdr->cd_length) * ss; SET32(meta, cdr->CRC, 0xffffffff); SET32(meta, cdr->CRC, crc32(meta->cdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, meta->cdr, size); if (error != 0) goto err; size = GET32(meta, hdr->pdr_length) * ss; SET32(meta, pdr->CRC, 0xffffffff); SET32(meta, pdr->CRC, crc32(meta->pdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, meta->pdr, size); if (error != 0) goto err; size = GET32(meta, hdr->vdr_length) * ss; SET32(meta, vdr->CRC, 0xffffffff); SET32(meta, vdr->CRC, crc32(meta->vdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, meta->vdr, size); if (error != 0) goto err; size = GET16(meta, hdr->Configuration_Record_Length) * ss; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); SET32D(meta, vdc->CRC, 0xffffffff); SET32D(meta, vdc->CRC, crc32(vdc, size)); } error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, meta->cr, size * num); if (error != 0) goto err; size = GET32(meta, hdr->pdd_length) * ss; SET32(meta, pdd->CRC, 0xffffffff); SET32(meta, pdd->CRC, crc32(meta->pdd, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, meta->pdd, size); if (error != 0) goto err; if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; SET32(meta, bbm->CRC, 0xffffffff); SET32(meta, bbm->CRC, crc32(meta->bbm, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, meta->bbm, size); if (error != 0) goto err; } done: if (lba == plba && slba != -1) { lba = slba; goto next; } return (error); } static int ddf_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_DDF); return (error); } static struct g_raid_volume * g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID) { struct g_raid_volume *vol; struct g_raid_md_ddf_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0) break; } return (vol); } static struct g_raid_disk * g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id) { struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct ddf_meta *meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; if (GUID != NULL) { if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0) break; } else { if (GET32(meta, pdd->PD_Reference) == id) break; } } return (disk); } static int g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { if (vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return 
(res); } static int g_raid_md_ddf_purge_disks(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_ddf_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_ddf_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_DDF); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); #endif return (0); } static int g_raid_md_ddf_supported(int level, int qual, int disks, int force) { if (disks > DDF_MAX_DISKS_HARD) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (qual == G_RAID_VOLUME_RLQ_R1SM) { if (!force && disks != 2) return (0); } else if (qual == G_RAID_VOLUME_RLQ_R1MM) { if (!force && disks != 3) return (0); } else return (0); break; case G_RAID_VOLUME_RL_RAID3: if (qual != G_RAID_VOLUME_RLQ_R3P0 && qual != G_RAID_VOLUME_RLQ_R3PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID4: if (qual != G_RAID_VOLUME_RLQ_R4P0 && qual != G_RAID_VOLUME_RLQ_R4PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (qual != G_RAID_VOLUME_RLQ_R5RA && qual != G_RAID_VOLUME_RLQ_R5RS && qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID6: if (qual != G_RAID_VOLUME_RLQ_R6RA && qual != G_RAID_VOLUME_RLQ_R6RS && qual != G_RAID_VOLUME_RLQ_R6LA && qual != G_RAID_VOLUME_RLQ_R6LS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAIDMDF: if (qual != G_RAID_VOLUME_RLQ_RMDFRA && qual != G_RAID_VOLUME_RLQ_RMDFRS && qual != G_RAID_VOLUME_RLQ_RMDFLA && qual != G_RAID_VOLUME_RLQ_RMDFLS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (qual != G_RAID_VOLUME_RLQ_R1EA && qual != G_RAID_VOLUME_RLQ_R1EO) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5E: if (qual != G_RAID_VOLUME_RLQ_R5ERA && qual != G_RAID_VOLUME_RLQ_R5ERS && qual != G_RAID_VOLUME_RLQ_R5ELA && qual != G_RAID_VOLUME_RLQ_R5ELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5EE: if (qual != G_RAID_VOLUME_RLQ_R5EERA && qual != G_RAID_VOLUME_RLQ_R5EERS && qual != G_RAID_VOLUME_RLQ_R5EELA && qual != G_RAID_VOLUME_RLQ_R5EELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5R: if (qual != G_RAID_VOLUME_RLQ_R5RRA && qual != G_RAID_VOLUME_RLQ_R5RRS && qual != G_RAID_VOLUME_RLQ_R5RLA && qual != G_RAID_VOLUME_RLQ_R5RLS) return (0); if (disks < 3) return (0); break; default: return (0); } return (1); } static int g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct 
g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_meta *pdmeta, *gmeta; struct ddf_vdc_record *vdc1; struct ddf_sa_record *sa; off_t size, eoff = 0, esize = 0; uint64_t *val2; int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos; int i, resurrection = 0; uint32_t reference; sc = disk->d_softc; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; reference = GET32(&pd->pd_meta, pdd->PD_Reference); pv = vol->v_md_data; vmeta = &pv->pv_meta; gmeta = &mdi->mdio_meta; /* Find disk position in metadata by its reference. */ disk_pos = ddf_meta_find_disk(vmeta, reference, &md_disk_bvd, &md_disk_pos); md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not a present part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If disk has some metadata for this volume - erase. */ if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) SET32D(pdmeta, vdc1->Signature, 0xffffffff); /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { G_RAID_DEBUG1(1, sc, "No free partitions on disk %s", g_raid_get_diskname(disk)); goto nofit; } ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } eoff *= pd->pd_meta.sectorsize; esize *= pd->pd_meta.sectorsize; size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && esize < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), esize, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size; md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX } else { nofit: if (disk->d_state == G_RAID_DISK_S_NONE) g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } /* * If spare is committable, delete spare record. * Othersize, mark it active and leave there. 
*/ sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa != NULL) { if ((GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_REVERTIBLE) == 0) { SET32D(&pd->pd_meta, sa->Signature, 0xffffffff); } else { SET8D(&pd->pd_meta, sa->Spare_Type, GET8D(&pd->pd_meta, sa->Spare_Type) | DDF_SAR_TYPE_ACTIVE); } } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = eoff; sd->sd_size = esize; } else if (pdmeta->cr != NULL && (vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) { val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512; sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & (DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = 0; } else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 || (GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) != DDF_VDE_INIT_FULL) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_ddf_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
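	 * A disk qualifies only while its DDF metadata still has a free
	 * partition (configuration record) slot.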
*/ pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) < GET16(&pd->pd_meta, hdr->Max_Partitions)) { update = g_raid_md_ddf_start_disk(disk, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_ddf(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_ddf_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; uint64_t *val2; int i, j, bvd; sc = vol->v_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pv = vol->v_md_data; vmeta = &pv->pv_meta; vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level); vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ); if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 && GET8(vmeta, vdc->Secondary_RAID_Level) == 0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_sectorsize = GET16(vmeta, vdc->Block_Size); if (vol->v_sectorsize == 0xffff) vol->v_sectorsize = vmeta->sectorsize; vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size); vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) * GET8(vmeta, vdc->Secondary_Element_Count); vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks); vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial); vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method); if (GET8(vmeta, vdc->Rotate_Parity_count) > 31) vol->v_rotate_parity = 1; else vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count); vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize; for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) { if (j == GET16(vmeta, vdc->Primary_Element_Count)) { j = 0; bvd++; } sd = &vol->v_subdisks[i]; if (vmeta->bvdc[bvd] == NULL) { sd->sd_offset = 0; sd->sd_size = GET64(vmeta, vdc->Block_Count) * vol->v_sectorsize; continue; } val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize; sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) * vol->v_sectorsize; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL) g_raid_md_ddf_start_disk(disk, vol); } pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_ddf_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_ddf_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_ddf_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct g_raid_volume *vol; struct ddf_meta *pdmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_vd_entry *vde; int i, j, k, num, have, need, cnt, spare; uint32_t val; char buf[17]; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; spare = -1; if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, pdmeta); else ddf_meta_update(&mdi->mdio_meta, pdmeta); num = GETCRNUM(pdmeta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(pdmeta, j); val = GET32D(pdmeta, vdc->Signature); if (val == DDF_SA_SIGNATURE && spare == -1) spare = 1; if (val != DDF_VDCR_SIGNATURE) continue; spare = 0; k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID); if (k < 0) continue; vde = &pdmeta->vdr->entry[k]; /* Look for volume with matching ID. */ vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID); if (vol == NULL) { ddf_meta_get_name(pdmeta, k, buf); vol = g_raid_create_volume(sc, buf, GET16D(pdmeta, vde->VD_Number)); pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_ddf_go, vol); mdi->mdio_starting++; } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ vmeta = &pv->pv_meta; ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started); } if (spare == 1) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_ddf_refill(sc); } TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; vmeta = &pv->pv_meta; if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL) continue; if (pv->pv_started) { if (g_raid_md_ddf_start_disk(disk, vol)) g_raid_md_write_ddf(md, vol, NULL, NULL); continue; } /* If we collected all needed disks - start array. 
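	 * Each secondary element (BVD) contributes Primary_Element_Count
	 * disks to the total number required.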
*/ need = 0; have = 0; for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) { if (vmeta->bvdc[k] == NULL) { need += GET16(vmeta, vdc->Primary_Element_Count); continue; } cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count); need += cnt; for (i = 0; i < cnt; i++) { val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]); if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL) have++; } } G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks", vol->v_name, have, need); if (have == need) g_raid_md_ddf_start(vol); } } static int g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp, struct gctl_req *req, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; struct g_raid_md_ddf_object *mdi, *mdi1; char name[16]; const char *fmtopt; int be = 1; mdi = (struct g_raid_md_ddf_object *)md; fmtopt = gctl_get_asciiparam(req, "fmtopt"); if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0) be = 1; else if (strcasecmp(fmtopt, "LE") == 0) be = 0; else { gctl_error(req, "Incorrect fmtopt argument."); return (G_RAID_MD_TASTE_FAIL); } /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi1->mdio_bigendian != be) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct ddf_meta meta; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct g_geom *geom; int error, result, be; char name[16]; G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name); mdi = (struct g_raid_md_ddf_object *)md; pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); bzero(&meta, sizeof(meta)); error = ddf_meta_read(cp, &meta); g_topology_lock(); if (error != 0) return (G_RAID_MD_TASTE_FAIL); be = meta.bigendian; /* Metadata valid. Print it. */ g_raid_md_ddf_print(&meta); /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi->mdio_bigendian != be) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. 
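 * From here the taste either joins an existing DDF node or the one just
 * created above: the passed consumer's read access is dropped, a new
 * consumer owned by the node's geom is attached to the provider, and the
 * disk built around the metadata read earlier is handed to
 * g_raid_md_ddf_new_disk().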
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); pd->pd_meta = meta; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_ddf_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_ddf_pervolume *pv; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_ddf_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_ddf(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD]; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_sa_record *sa; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize; intmax_t *sizearg, *striparg; int i, numdisks, len, level, qual; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_ddf_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
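 * For the "label" request each named provider is either matched against a
 * disk already attached to this node (reusing its unused metadata range)
 * or opened fresh; the usable size is clamped to the smallest free extent
 * or configured size seen, and the volume sector size becomes the largest
 * member sector size.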
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { gctl_error(req, "No free partitions " "on disk '%s'.", diskname); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; ddf_meta_unused_range(&pd->pd_meta, &offs[i], &esize); offs[i] *= pp->sectorsize; size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; ddf_meta_create(disk, &mdi->mdio_meta); if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, GET64(&pd->pd_meta, pdr->entry[0].Configured_Size) * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
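 * The media size set below is per-level arithmetic: N*size for
 * RAID0/CONCAT/SINGLE, size for RAID1, (N-1)*size for the single-parity
 * levels, (N-2)*size for RAID6/RAID5E/RAID5EE, N minus the MDF parity
 * disk count for RAIDMDF, and half the total rounded down to a whole
 * strip for RAID1E.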
*/ pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta); pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID4 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else if (level == G_RAID_VOLUME_RL_RAID5R) { vol->v_mediasize = size * (numdisks - 1); vol->v_rotate_parity = 1024; } else if (level == G_RAID_VOLUME_RL_RAID6 || level == G_RAID_VOLUME_RL_RAID5E || level == G_RAID_VOLUME_RL_RAID5EE) vol->v_mediasize = size * (numdisks - 2); else if (level == G_RAID_VOLUME_RL_RAIDMDF) { if (numdisks < 5) vol->v_mdf_pdisks = 2; else vol->v_mdf_pdisks = 3; vol->v_mdf_polynomial = 0x11d; vol->v_mdf_method = 0x00; vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks); } else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = offs[i]; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. 
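 * The volume named in the request is looked up first by its g_raid name,
 * then by provider name (with or without the "raid/" prefix), and finally,
 * if the argument is purely numeric, by the volume's global ID.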
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_ddf_purge_disks(sc); g_raid_md_write_ddf(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_ddf(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. 
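 * A freshly inserted disk is marked SPARE and gets new DDF metadata with a
 * Spare Assignment Record sized to fit the configuration record length, so
 * the following refill pass can use it to replace missing array members.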
*/ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); ddf_meta_create(disk, &mdi->mdio_meta); sa = ddf_meta_find_sa(&pd->pd_meta, 1); if (sa != NULL) { SET32D(&pd->pd_meta, sa->Signature, DDF_SA_SIGNATURE); SET8D(&pd->pd_meta, sa->Spare_Type, 0); SET16D(&pd->pd_meta, sa->Populated_SAEs, 0); SET16D(&pd->pd_meta, sa->MAX_SAE_Supported, (GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize - sizeof(struct ddf_sa_record)) / sizeof(struct ddf_sa_entry)); } if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_raid_md_write_ddf(md, NULL, NULL, NULL); g_raid_md_ddf_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_meta *gmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_sa_record *sa; uint64_t *val2; int i, j, pos, bvd, size; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; gmeta = &mdi->mdio_meta; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* * Clear disk flags to let only really needed ones to be reset. * Do it only if there are no volumes in starting state now, * as they can update disk statuses yet and we may kill innocent. */ if (mdi->mdio_starting == 0) { for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) & ~(DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)); if ((GET16(gmeta, pdr->entry[i].PD_State) & DDF_PDE_PFA) == 0) SET16(gmeta, pdr->entry[i].PD_State, 0); } } /* Generate/update new per-volume metadata. 
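 * For every started volume the VDC sequence number is bumped and the level,
 * stripe size, element counts and VD state are regenerated from the
 * in-memory volume, then the per-BVD configuration records and the global
 * physical disk states are refreshed from the current subdisk states.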
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; if (vol->v_stopping || !pv->pv_started) continue; vmeta = &pv->pv_meta; SET32(vmeta, vdc->Sequence_Number, GET32(vmeta, vdc->Sequence_Number) + 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) SET16(vmeta, vdc->Primary_Element_Count, 2); else SET16(vmeta, vdc->Primary_Element_Count, vol->v_disks_count); SET8(vmeta, vdc->Stripe_Size, ffs(vol->v_strip_size / vol->v_sectorsize) - 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) { SET8(vmeta, vdc->Primary_RAID_Level, DDF_VDCR_RAID1); SET8(vmeta, vdc->RLQ, 0); SET8(vmeta, vdc->Secondary_Element_Count, vol->v_disks_count / 2); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } else { SET8(vmeta, vdc->Primary_RAID_Level, vol->v_raid_level); SET8(vmeta, vdc->RLQ, vol->v_raid_level_qualifier); SET8(vmeta, vdc->Secondary_Element_Count, 1); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } SET8(vmeta, vdc->Secondary_Element_Seq, 0); SET64(vmeta, vdc->Block_Count, 0); SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize); SET16(vmeta, vdc->Block_Size, vol->v_sectorsize); SET8(vmeta, vdc->Rotate_Parity_count, fls(vol->v_rotate_parity) - 1); SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks); SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial, vol->v_mdf_polynomial); SET8(vmeta, vdc->MDF_Constant_Generation_Method, vol->v_mdf_method); SET16(vmeta, vde->VD_Number, vol->v_global_id); if (vol->v_state <= G_RAID_VOLUME_S_BROKEN) SET8(vmeta, vde->VD_State, DDF_VDE_FAILED); else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED); else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL) SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL); else SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL); if (vol->v_dirty || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0) SET8(vmeta, vde->VD_State, GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY); SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX ddf_meta_put_name(vmeta, vol->v_name); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; bvd = i / GET16(vmeta, vdc->Primary_Element_Count); pos = i % GET16(vmeta, vdc->Primary_Element_Count); disk = sd->sd_disk; if (disk != NULL) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (vmeta->bvdc[bvd] == NULL) { size = GET16(vmeta, hdr->Configuration_Record_Length) * vmeta->sectorsize; vmeta->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memset(vmeta->bvdc[bvd], 0xff, size); } memcpy(vmeta->bvdc[bvd], vmeta->vdc, sizeof(struct ddf_vdc_record)); SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd); SET64(vmeta, bvdc[bvd]->Block_Count, sd->sd_size / vol->v_sectorsize); SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos], GET32(&pd->pd_meta, pdd->PD_Reference)); val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); SET64P(vmeta, val2 + pos, sd->sd_offset / vol->v_sectorsize); } if (vmeta->bvdc[bvd] == NULL) continue; j = ddf_meta_find_pd(gmeta, NULL, GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos])); if (j < 0) continue; SET16(gmeta, pdr->entry[j].PD_Type, GET16(gmeta, pdr->entry[j].PD_Type) | DDF_PDE_PARTICIPATING); if (sd->sd_state == G_RAID_SUBDISK_S_NONE) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_MISSING)); else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) SET16(gmeta, 
pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_REBUILD); else SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_ONLINE); } } /* Mark spare and failed disks as such. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; i = ddf_meta_find_pd(gmeta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); if (i < 0) continue; if (disk->d_state == G_RAID_DISK_S_FAILED) { SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); } if (disk->d_state != G_RAID_DISK_S_SPARE) continue; sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa == NULL || (GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_DEDICATED) == 0) { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_GLOBAL_SPARE); } else { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_CONFIG_SPARE); } SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | DDF_PDE_ONLINE); } /* Remove disks without "participating" flag (unused). */ for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; if ((GET16(gmeta, pdr->entry[i].PD_Type) & (DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 || g_raid_md_ddf_get_disk(sc, NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL) j = i; else memset(&gmeta->pdr->entry[i], 0xff, sizeof(struct ddf_pd_entry)); } SET16(gmeta, pdr->Populated_PDEs, j + 1); /* Update per-disk metadata and write them. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; /* Update PDR. */ memcpy(pd->pd_meta.pdr, gmeta->pdr, GET32(&pd->pd_meta, hdr->pdr_length) * pd->pd_meta.sectorsize); /* Update VDR. */ SET16(&pd->pd_meta, vdr->Populated_VDEs, 0); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; i = ddf_meta_find_vd(&pd->pd_meta, pv->pv_meta.vde->VD_GUID); if (i < 0) i = ddf_meta_find_vd(&pd->pd_meta, NULL); if (i >= 0) memcpy(&pd->pd_meta.vdr->entry[i], pv->pv_meta.vde, sizeof(struct ddf_vd_entry)); } /* Update VDC. */ if (mdi->mdio_starting == 0) { /* Remove all VDCs to restore needed later. 
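 * Invalidate every configuration-record slot first (signature 0xffffffff),
 * then the loop below copies back one record per subdisk this disk still
 * participates in, so records for departed volumes are not carried over.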
*/ j = GETCRNUM(&pd->pd_meta); for (i = 0; i < j; i++) { vdc = GETVDCPTR(&pd->pd_meta, i); if (GET32D(&pd->pd_meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff); } } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { vol = sd->sd_volume; if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; vmeta = &pv->pv_meta; vdc = ddf_meta_find_vdc(&pd->pd_meta, vmeta->vde->VD_GUID); if (vdc == NULL) vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL); if (vdc != NULL) { bvd = sd->sd_pos / GET16(vmeta, vdc->Primary_Element_Count); memcpy(vdc, vmeta->bvdc[bvd], GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize); } } G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(disk)); g_raid_md_ddf_print(&pd->pd_meta); ddf_meta_write(disk->d_consumer, &pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_ddf(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_ddf_perdisk *pd; struct g_raid_subdisk *sd; int i; sc = md->mdo_softc; pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(tdisk)); i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA); if (tdisk->d_consumer != NULL) ddf_meta_write(tdisk->d_consumer, &pd->pd_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. 
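 * The disk just failed is no longer ACTIVE, so the rewrite below only
 * touches the surviving ACTIVE/SPARE members; the refill pass then looks
 * for a spare to take over the failed slot.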
*/ g_raid_md_write_ddf(md, NULL, NULL, tdisk); g_raid_md_ddf_refill(sc); return (0); } static int g_raid_md_free_disk_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_ddf_perdisk *pd; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; ddf_meta_free(&pd->pd_meta); free(pd, M_MD_DDF); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_ddf_object *mdi; struct g_raid_md_ddf_pervolume *pv; mdi = (struct g_raid_md_ddf_object *)md; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; ddf_vol_meta_free(&pv->pv_meta); if (!pv->pv_started) { pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); } free(pv, M_MD_DDF); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_ddf(struct g_raid_md_object *md) { struct g_raid_md_ddf_object *mdi; mdi = (struct g_raid_md_ddf_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } ddf_meta_free(&mdi->mdio_meta); return (0); } G_RAID_MD_DECLARE(ddf, "DDF"); Index: head/sys/geom/raid/md_intel.c =================================================================== --- head/sys/geom/raid/md_intel.c (revision 350693) +++ head/sys/geom/raid/md_intel.c (revision 350694) @@ -1,2717 +1,2718 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_UNINITIALIZED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t offset_hi; uint32_t disk_sectors_hi; uint32_t stripe_count_hi; uint32_t filler_2[4]; uint32_t disk_idx[1]; /* total_disks entries. */ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; #define INTEL_ST_BOOTABLE 0x00000001 #define INTEL_ST_BOOT_DEVICE 0x00000002 #define INTEL_ST_READ_COALESCING 0x00000004 #define INTEL_ST_WRITE_COALESCING 0x00000008 #define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 #define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 #define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 #define INTEL_ST_VERIFY_AND_FIX 0x00000080 #define INTEL_ST_MAP_STATE_UNINIT 0x00000100 #define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 #define INTEL_ST_CLONE_N_GO 0x00000400 #define INTEL_ST_CLONE_MAN_SYNC 0x00000800 #define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; #define INTEL_CNGST_UPDATED 0 #define INTEL_CNGST_NEEDS_UPDATE 1 #define INTEL_CNGST_MASTER_MISSING 2 uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 #define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t curr_migr_unit_hi; uint32_t filler_1[3]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 #define INTEL_F_DISABLED 0x80 uint32_t owner_cfg_num; uint32_t sectors_hi; uint32_t filler[3]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. 
" uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_EXT_STRIP 0x00000040 #define INTEL_ATTR_NVM_CACHE 0x02000000 #define INTEL_ATTR_2TB_DISK 0x04000000 #define INTEL_ATTR_BBM 0x08000000 #define INTEL_ATTR_NVM_CACHE2 0x10000000 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t error_log_pos; uint8_t dummy_2[1]; uint32_t cache_size; uint32_t orig_config_id; uint32_t pwr_cycle_count; uint32_t bbm_log_size; uint32_t filler_0[35]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. */ } __packed; #define INTEL_ATTR_SUPPORTED ( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 | \ INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 | \ INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | \ INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM ) #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_pervolume { int pv_volume_pos; int pv_cng; int pv_cng_man_sync; int pv_cng_master_disk; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_orig_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_volume_t g_raid_md_free_volume_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_enable = 1, .mdc_priority = 100 }; static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 1 : 0)) return (NULL); mmap = &mvol->map[0]; for (; i > 0; i--) { mmap = (struct intel_raid_map *) &mmap->disk_idx[mmap->total_disks]; } return ((struct intel_raid_map *)mmap); } static struct intel_raid_vol * intel_get_volume(struct intel_raid_conf *meta, int i) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; if (i > 1) return (NULL); mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; for (; i > 0; i--) { mmap = intel_get_map(mvol, mvol->migr_state ? 
1 : 0); mvol = (struct intel_raid_vol *) &mmap->disk_idx[mmap->total_disks]; } return (mvol); } static off_t intel_get_map_offset(struct intel_raid_map *mmap) { off_t offset = (off_t)mmap->offset_hi << 32; offset += mmap->offset; return (offset); } static void intel_set_map_offset(struct intel_raid_map *mmap, off_t offset) { mmap->offset = offset & 0xffffffff; mmap->offset_hi = offset >> 32; } static off_t intel_get_map_disk_sectors(struct intel_raid_map *mmap) { off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32; disk_sectors += mmap->disk_sectors; return (disk_sectors); } static void intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors) { mmap->disk_sectors = disk_sectors & 0xffffffff; mmap->disk_sectors_hi = disk_sectors >> 32; } static void intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count) { mmap->stripe_count = stripe_count & 0xffffffff; mmap->stripe_count_hi = stripe_count >> 32; } static off_t intel_get_disk_sectors(struct intel_raid_disk *disk) { off_t sectors = (off_t)disk->sectors_hi << 32; sectors += disk->sectors; return (sectors); } static void intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors) { disk->sectors = sectors & 0xffffffff; disk->sectors_hi = sectors >> 32; } static off_t intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol) { off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32; curr_migr_unit += vol->curr_migr_unit; return (curr_migr_unit); } static void intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit) { vol->curr_migr_unit = curr_migr_unit & 0xffffffff; vol->curr_migr_unit_hi = curr_migr_unit >> 32; } static char * intel_status2str(int status) { switch (status) { case INTEL_S_READY: return ("READY"); case INTEL_S_UNINITIALIZED: return ("UNINITIALIZED"); case INTEL_S_DEGRADED: return ("DEGRADED"); case INTEL_S_FAILURE: return ("FAILURE"); default: return ("UNKNOWN"); } } static char * intel_type2str(int type) { switch (type) { case INTEL_T_RAID0: return ("RAID0"); case INTEL_T_RAID1: return ("RAID1"); case INTEL_T_RAID5: return ("RAID5"); default: return ("UNKNOWN"); } } static char * intel_cngst2str(int cng_state) { switch (cng_state) { case INTEL_CNGST_UPDATED: return ("UPDATED"); case INTEL_CNGST_NEEDS_UPDATE: return ("NEEDS_UPDATE"); case INTEL_CNGST_MASTER_MISSING: return ("MASTER_MISSING"); default: return ("UNKNOWN"); } } static char * intel_mt2str(int type) { switch (type) { case INTEL_MT_INIT: return ("INIT"); case INTEL_MT_REBUILD: return ("REBUILD"); case INTEL_MT_VERIFY: return ("VERIFY"); case INTEL_MT_GEN_MIGR: return ("GEN_MIGR"); case INTEL_MT_STATE_CHANGE: return ("STATE_CHANGE"); case INTEL_MT_REPAIR: return ("REPAIR"); default: return ("UNKNOWN"); } } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("error_log_size %d\n", meta->error_log_size); printf("attributes 0x%b\n", meta->attributes, "\020" "\001RAID0" "\002RAID1" "\003RAID10" "\004RAID1E" "\005RAID15" "\006RAIDCNG" "\007EXT_STRIP" "\032NVM_CACHE" "\0332TB_DISK" "\034BBM" "\035NVM_CACHE" "\0362TB" "\037PM" "\040CHECKSUM"); 
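/*
 * The "%b" conversions in this dump use the kernel printf(9) bit-field
 * format: the leading "\020" selects hexadecimal output for the value and
 * each following bit-number/name pair in the string labels one set bit.
 */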
printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("error_log_pos %u\n", meta->error_log_pos); printf("cache_size %u\n", meta->cache_size); printf("orig_config_id 0x%08x\n", meta->orig_config_id); printf("pwr_cycle_count %u\n", meta->pwr_cycle_count); printf("bbm_log_size %u\n", meta->bbm_log_size); printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n"); printf("DISK# serial disk_sectors disk_sectors_hi disk_id flags owner\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u %u 0x%08x 0x%b %08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].sectors_hi, meta->disk[i].id, meta->disk[i].flags, "\20\01S\02A\03F\04O\05D", meta->disk[i].owner_cfg_num); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state 0x%b\n", mvol->state, "\020" "\001BOOTABLE" "\002BOOT_DEVICE" "\003READ_COALESCING" "\004WRITE_COALESCING" "\005LAST_SHUTDOWN_DIRTY" "\006HIDDEN_AT_BOOT" "\007CURRENTLY_HIDDEN" "\010VERIFY_AND_FIX" "\011MAP_STATE_UNINIT" "\012NO_AUTO_RECOVERY" "\013CLONE_N_GO" "\014CLONE_MAN_SYNC" "\015CNG_MASTER_DISK_NUM"); printf(" reserved %u\n", mvol->reserved); printf(" migr_priority %u\n", mvol->migr_priority); printf(" num_sub_vols %u\n", mvol->num_sub_vols); printf(" tid %u\n", mvol->tid); printf(" cng_master_disk %u\n", mvol->cng_master_disk); printf(" cache_policy %u\n", mvol->cache_policy); printf(" cng_state %u (%s)\n", mvol->cng_state, intel_cngst2str(mvol->cng_state)); printf(" cng_sub_state %u\n", mvol->cng_sub_state); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" curr_migr_unit_hi %u\n", mvol->curr_migr_unit_hi); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u (%s)\n", mvol->migr_type, intel_mt2str(mvol->migr_type)); printf(" dirty %u\n", mvol->dirty); printf(" fs_state %u\n", mvol->fs_state); printf(" verify_errors %u\n", mvol->verify_errors); printf(" bad_blocks %u\n", mvol->bad_blocks); for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" offset_hi %u\n", mmap->offset_hi); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" disk_sectors_hi %u\n", mmap->disk_sectors_hi); printf(" stripe_count %u\n", mmap->stripe_count); printf(" stripe_count_hi %u\n", mmap->stripe_count_hi); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u (%s)\n", mmap->status, intel_status2str(mmap->status)); printf(" type %u (%s)\n", mmap->type, intel_type2str(mmap->type)); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap, *mmap1; char *buf; int error, i, j, k, left, size; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536 || meta->config_size < sizeof(struct intel_raid_conf)) { G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } size = meta->config_size; meta = malloc(size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, min(size, pp->sectorsize)); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } /* Validate metadata size. 
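 * The expected size is rebuilt from the structure counts (anchor, per-disk
 * entries, per-volume records plus their map disk index arrays, and a
 * second map when a migration is recorded) and compared against
 * config_size at each step; anything larger than advertised is rejected.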
*/ size = sizeof(struct intel_raid_conf) + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + sizeof(struct intel_raid_vol) * meta->total_volumes; if (size > meta->config_size) { badsize: G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", meta->config_size, size); free(meta, M_MD_INTEL); return (NULL); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; if (mvol->migr_state) { size += sizeof(struct intel_raid_map); if (size > meta->config_size) goto badsize; mmap = intel_get_map(mvol, 1); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; } } g_raid_md_intel_print(meta); if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) { G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'", meta->version); free(meta, M_MD_INTEL); return (NULL); } if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 && (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) { G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x", meta->attributes & ~INTEL_ATTR_SUPPORTED); free(meta, M_MD_INTEL); return (NULL); } /* Validate disk indexes. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { mmap = intel_get_map(mvol, j); for (k = 0; k < mmap->total_disks; k++) { if ((mmap->disk_idx[k] & INTEL_DI_IDX) > meta->total_disks) { G_RAID_DEBUG(1, "Intel metadata disk" " index %d too big (>%d)", mmap->disk_idx[k] & INTEL_DI_IDX, meta->total_disks); free(meta, M_MD_INTEL); return (NULL); } } } } /* Validate migration types. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); /* Deny unknown migration types. */ if (mvol->migr_state && mvol->migr_type != INTEL_MT_INIT && mvol->migr_type != INTEL_MT_REBUILD && mvol->migr_type != INTEL_MT_VERIFY && mvol->migr_type != INTEL_MT_GEN_MIGR && mvol->migr_type != INTEL_MT_REPAIR) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " migration type %d", mvol->migr_type); free(meta, M_MD_INTEL); return (NULL); } /* Deny general migrations except SINGLE->RAID1. */ if (mvol->migr_state && mvol->migr_type == INTEL_MT_GEN_MIGR) { mmap = intel_get_map(mvol, 0); mmap1 = intel_get_map(mvol, 1); if (mmap1->total_disks != 1 || mmap->type != INTEL_T_RAID1 || mmap->total_disks != 2 || mmap->offset != mmap1->offset || mmap->disk_sectors != mmap1->disk_sectors || mmap->total_domains != mmap->total_disks || mmap->offset_hi != mmap1->offset_hi || mmap->disk_sectors_hi != mmap1->disk_sectors_hi || (mmap->disk_idx[0] != mmap1->disk_idx[0] && mmap->disk_idx[0] != mmap1->disk_idx[1])) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " variant of general migration"); free(meta, M_MD_INTEL); return (NULL); } } } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. 
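 * Intel metadata grows backwards from the end of the disk: the anchor
 * block occupies the next-to-last sector and any overflow is laid out in
 * the sectors immediately before it, which is why the first sectorsize
 * bytes of the structure are copied into the last sector of the buffer.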
*/ sectors = howmany(meta->config_size, pp->sectorsize); buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. */ meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); memcpy(&meta->version[0], INTEL_VERSION_1000, sizeof(INTEL_VERSION_1000) - 1); meta->config_size = INTEL_MAX_MD_SIZE(1); meta->config_id = meta->orig_config_id = arc4random(); meta->generation = 1; meta->total_disks = 1; meta->disk[0] = *d; error = intel_meta_write(cp, meta); free(meta, M_MD_INTEL); return (error); } static struct g_raid_disk * g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_intel_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_intel_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (!force && disks > 6) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static struct g_raid_volume * g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) { struct g_raid_volume *mvol; struct g_raid_md_intel_pervolume *pv; TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { pv = mvol->v_md_data; if (pv->pv_volume_pos == id) break; } return (mvol); } static int g_raid_md_intel_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd, *oldpd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0, migr_global, i; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. 
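 * Intel metadata identifies members by their serial number rather than by
 * position, so a disk that was moved to another port can presumably still
 * be recognized; a serial not present in the metadata is treated as new,
 * spare or stale by the code below.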
*/ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) && !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { off_t disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (sd->sd_offset + sd->sd_size + 4096 > disk_sectors * 512) { G_RAID_DEBUG1(1, sc, "Disk too small (%llu < %llu)", (unsigned long long) disk_sectors * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. 
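 * The disk state follows the metadata flags (DISABLED, FAILED, SPARE, else
 * ACTIVE), and each of its subdisks is then placed according to the
 * volume's migration state: NEW for freshly inserted members, REBUILD or
 * RESYNC with a checkpoint for in-progress migrations, STALE after an
 * unclean shutdown, ACTIVE otherwise.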
*/ if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED); else if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { pv = sd->sd_volume->v_md_data; mvol = intel_get_volume(meta, pv->pv_volume_pos); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; migr_global = 1; for (i = 0; i < mmap0->total_disks; i++) { if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 && (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0) migr_global = 0; } if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) { /* Disabled disk, useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); } else if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { /* Failed disk, almost useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->status == INTEL_S_UNINITIALIZED && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->migr_type == INTEL_MT_INIT && migr_global) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) || migr_global) { /* Resyncing disk. 
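 * The resume position is reconstructed from the migration checkpoint
 * (curr_migr_unit scaled by strip size and domain count); a dirty volume
 * drops the checkpoint and restarts the pass from the beginning.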
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_GEN_MIGR) { if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED); if (na == meta->total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, meta->total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_DISABLED) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) { g_raid_md_write_intel(md, NULL, NULL, NULL); meta = mdi->mdio_meta; } /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. 
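When the refill pass above still leaves the array incomplete, the driver does not call g_retaste() inline; it queues a one-shot task on the software-interrupt taskqueue and lets the handler free its own argument. A minimal sketch of that pattern, assuming kernel context and using the generic M_TEMP malloc type in place of the driver's private one:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/taskqueue.h>

static void
deferred_work(void *arg, int pending)
{
	/* ... perform the deferred work (the driver calls g_retaste()) ... */
	free(arg, M_TEMP);			/* the handler owns its argument */
}

static void
schedule_deferred_work(void)
{
	struct task *task;

	task = malloc(sizeof(*task), M_TEMP, M_WAITOK | M_ZERO);
	TASK_INIT(task, 0, deferred_work, task);	/* pass the task as its own argument */
	taskqueue_enqueue(taskqueue_swi, task);
}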
*/ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0; pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0; if (mvol->cng_master_disk < mmap->total_disks) pv->pv_cng_master_disk = mvol->cng_master_disk; vol->v_md_data = pv; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_domains >= 2 && mmap->total_domains <= mmap->total_disks) { /* Assume total_domains is correct. */ if (mmap->total_domains == mmap->total_disks) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID1) { /* total_domains looks wrong. */ if (mmap->total_disks <= 2) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
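The level mapping above is essentially a small decision table: Intel's RAID1 map type covers both plain mirrors and RAID1E, with total_domains (the number of mirror copies) deciding which, and a guess from the disk count when total_domains looks inconsistent. A condensed standalone restatement under placeholder names; the real INTEL_T_* on-disk encodings are defined in the metadata header, not here.

enum map_type { T_RAID0, T_RAID1, T_RAID5, T_OTHER };	/* placeholders for INTEL_T_* */
enum level { L_RAID0, L_RAID1, L_RAID1E, L_RAID5, L_UNKNOWN };

static enum level
map_to_level(enum map_type type, int total_domains, int total_disks)
{
	if (type == T_RAID0)
		return (L_RAID0);
	if (type == T_RAID1) {
		if (total_domains >= 2 && total_domains <= total_disks)
			return (total_domains == total_disks ?
			    L_RAID1 : L_RAID1E);
		/* total_domains looks wrong; guess from the disk count. */
		return (total_disks <= 2 ? L_RAID1 : L_RAID1E);
	}
	if (type == T_RAID5)		/* left-asymmetric layout (RLQ_R5LA) */
		return (L_RAID5);
	return (L_UNKNOWN);
}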
*/ g_raid_md_intel_refill(sc); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up 251 characters for SAS drives. Since intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness. 
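A userland-style illustration of the truncation described above, using a made-up 20-character SATA-style label and the 16-character field used by the Intel metadata. Only the tail of the label is kept; shorter labels are copied whole. Like the kernel routine below, strncpy() does not NUL-terminate a tail that exactly fills the field, which is acceptable for a fixed-width metadata field.

#include <stdio.h>
#include <string.h>

static void
serial_tail(const char *label, char *out, size_t outlen)
{
	size_t len = strlen(label);

	if (len > outlen)
		len -= outlen;		/* index where the kept tail starts */
	else
		len = 0;		/* short label: keep it all */
	strncpy(out, label + len, outlen);
}

int
main(void)
{
	char out[16 + 1] = { 0 };	/* +1 so the example can print it */

	serial_tail("WD-WCC4N1234567890AB", out, 16);	/* made-up 20-char label */
	printf("%s\n", out);		/* prints "CC4N1234567890AB" */
	return (0);
}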
*/ static int g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) { char serial_buffer[DISK_IDENT_SIZE]; int len, error; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); if (error != 0) return (error); len = strlen(serial_buffer); if (len > serlen) len -= serlen; else len = 0; strncpy(serial, serial_buffer + len, serlen); return (0); } static int g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_intel_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct intel_raid_conf *meta; struct g_raid_md_intel_perdisk *pd; struct g_geom *geom; int error, disk_pos, result, spare, len; char serial[INTEL_SERIAL_LEN]; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); mdi = (struct g_raid_md_intel_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; disk_pos = 0; g_topology_unlock(); error = g_raid_md_get_label(cp, serial, sizeof(serial)); if (error != 0) { G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", pp->name, error); goto fail2; } vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = intel_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor != 0x8086) { G_RAID_DEBUG(1, "Intel vendor mismatch 0x%04x != 0x8086", vendor); } else { G_RAID_DEBUG(1, "No Intel metadata, forcing spare."); spare = 2; goto search; } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (intel_get_disk_sectors(&meta->disk[disk_pos]) != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju", intel_get_disk_sectors(&meta->disk[disk_pos]), (off_t)(pp->mediasize / pp->sectorsize)); goto fail1; } G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; mdi->mdio_orig_config_id = meta->orig_config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. 
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_intel_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t off, size, sectorsize, strip, disk_sectors; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } g_raid_get_disk_info(disk); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. 
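The reservation above rounds the 4096-byte metadata area up to a whole number of sectors before subtracting it from the usable size. A small worked sketch; the helper name is illustrative:

#include <stdint.h>

/* Round the 4 KiB metadata reserve up to a whole number of sectors. */
static uint64_t
metadata_reserve(uint64_t sectorsize)
{
	return (((4096 + sectorsize - 1) / sectorsize) * sectorsize);
}
/*
 * metadata_reserve(512)  == 4096   (eight 512-byte sectors)
 * metadata_reserve(4096) == 4096   (one 4 KiB sector)
 * metadata_reserve(520)  == 4160   (eight 520-byte sectors)
 */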
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = 0; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } /* Look for existing volumes. 
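After the size checks, the per-disk size is trimmed down to a multiple of the strip (or of two strips for RAID1E with an odd number of disks), and the RAID1E volume size is then roughly half of the raw space, rounded down to a strip. A standalone restatement of that last formula with worked numbers:

#include <stdint.h>

/*
 * Usable RAID1E size for numdisks members of per-disk size "size"
 * (already rounded down to the strip), mirroring the formula above.
 */
static uint64_t
raid1e_mediasize(uint64_t size, uint64_t numdisks, uint64_t strip)
{
	return (((size * numdisks) / strip / 2) * strip);
}
/*
 * Example: three 1000 MiB members with a 128 KiB strip give
 * raid1e_mediasize(1048576000, 3, 131072) == 1572864000 bytes (1500 MiB):
 * every block is stored twice, so about half of the raw space is usable.
 */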
*/ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } numdisks = vol1->v_disks_count; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (disk_sectors * 512 < size) size = disk_sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ if (off % strip != 0) { size -= strip - off % strip; off += strip - off % strip; } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
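Intel metadata supports at most two volumes per disk set, so the "add" verb above only has to choose between the free region in front of the existing volume and the one behind it, taking whichever is larger. A standalone restatement under illustrative names:

#include <stdint.h>

struct gap { uint64_t off, len; };

/* Pick the larger of the free regions before and after the existing volume. */
static struct gap
pick_gap(uint64_t disk_size, uint64_t vol_off, uint64_t vol_size)
{
	struct gap g;
	uint64_t after = disk_size - (vol_off + vol_size);

	if (vol_off > after) {
		g.off = 0;			/* place the new volume in front */
		g.len = vol_off;
	} else {
		g.off = vol_off + vol_size;	/* place it behind */
		g.len = after;
	}
	return (g);
}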
*/ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. 
*/ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_raid_kill_consumer(sc, cp); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. */ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state, stale; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. 
*/ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_DISABLED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED | INTEL_F_DISABLED; } else { if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. */ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->orig_config_id = mdi->mdio_orig_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; if (pd->pd_disk_meta.sectors_hi != 0) meta->attributes |= INTEL_ATTR_2TB_DISK; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ pv->pv_volume_pos = vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else if ((vol->v_disks_count & 1) == 0) meta->attributes |= INTEL_ATTR_RAID10; else meta->attributes |= INTEL_ATTR_RAID1E; if (pv->pv_cng) meta->attributes |= INTEL_ATTR_RAIDCNG; if (vol->v_strip_size > 131072) meta->attributes |= INTEL_ATTR_EXT_STRIP; if (pv->pv_cng) cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / sectorsize; mvol->state = (INTEL_ST_READ_COALESCING | INTEL_ST_WRITE_COALESCING); mvol->tid = vol->v_global_id + 1; if (pv->pv_cng) { mvol->state |= INTEL_ST_CLONE_N_GO; if (pv->pv_cng_man_sync) mvol->state |= INTEL_ST_CLONE_MAN_SYNC; mvol->cng_master_disk = pv->pv_cng_master_disk; if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state == G_RAID_SUBDISK_S_NONE) mvol->cng_state = INTEL_CNGST_MASTER_MISSING; else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE; 
else mvol->cng_state = INTEL_CNGST_UPDATED; } /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; stale = 0; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) stale = 1; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; /* mvol->migr_type = INTEL_MT_REPAIR; */ mvol->migr_type = INTEL_MT_VERIFY; mvol->state |= INTEL_ST_VERIFY_AND_FIX; } else mvol->migr_state = 0; mvol->dirty = (vol->v_dirty || stale); mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. */ intel_set_map_offset(mmap0, sd->sd_offset / sectorsize); intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize); mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == g_raid_nsubdisks(vol, -1)) mmap0->status = INTEL_S_UNINITIALIZED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->total_domains = 2; else mmap0->total_domains = 1; intel_set_map_stripe_count(mmap0, sd->sd_size / vol->v_strip_size / mmap0->total_domains); mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { intel_set_vol_curr_migr_unit(mvol, pos / vol->v_strip_size / mmap0->total_domains); mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. 
*/ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && sd->sd_state != G_RAID_SUBDISK_S_STALE && sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (vi > 1 || meta->attributes & (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB)) version = INTEL_VERSION_1300; if (strcmp(version, INTEL_VERSION_1300) < 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1); /* We are done. Print meta data and store them to disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED; pd->pd_disk_meta.flags = INTEL_F_FAILED; g_raid_md_intel_print(mdi->mdio_meta); if (tdisk->d_consumer) intel_meta_write(tdisk->d_consumer, mdi->mdio_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } static int g_raid_md_free_disk_intel(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_intel_perdisk *pd; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } free(pd, M_MD_INTEL); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_intel(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_intel_pervolume *pv; pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data; free(pv, M_MD_INTEL); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_intel(struct g_raid_md_object *md) { struct g_raid_md_intel_object *mdi; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(intel, "Intel"); Index: head/sys/geom/raid/md_jmicron.c =================================================================== --- head/sys/geom/raid/md_jmicron.c (revision 350693) +++ head/sys/geom/raid/md_jmicron.c (revision 350694) @@ -1,1565 +1,1566 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); #define JMICRON_MAX_DISKS 8 #define JMICRON_MAX_SPARE 2 struct jmicron_raid_conf { u_int8_t signature[2]; #define JMICRON_MAGIC "JM" u_int16_t version; #define JMICRON_VERSION 0x0001 u_int16_t checksum; u_int8_t filler_1[10]; u_int32_t disk_id; u_int32_t offset; u_int32_t disk_sectors_high; u_int16_t disk_sectors_low; u_int8_t filler_2[2]; u_int8_t name[16]; u_int8_t type; #define JMICRON_T_RAID0 0 #define JMICRON_T_RAID1 1 #define JMICRON_T_RAID01 2 #define JMICRON_T_CONCAT 3 #define JMICRON_T_RAID5 5 u_int8_t stripe_shift; u_int16_t flags; #define JMICRON_F_READY 0x0001 #define JMICRON_F_BOOTABLE 0x0002 #define JMICRON_F_BADSEC 0x0004 #define JMICRON_F_ACTIVE 0x0010 #define JMICRON_F_UNSYNC 0x0020 #define JMICRON_F_NEWEST 0x0040 u_int8_t filler_3[4]; u_int32_t spare[JMICRON_MAX_SPARE]; u_int32_t disks[JMICRON_MAX_DISKS]; #define JMICRON_DISK_MASK 0xFFFFFFF0 #define JMICRON_SEG_MASK 0x0000000F u_int8_t filler_4[32]; u_int8_t filler_5[384]; }; struct g_raid_md_jmicron_perdisk { struct jmicron_raid_conf *pd_meta; int pd_disk_pos; int pd_disk_id; off_t pd_disk_size; }; struct g_raid_md_jmicron_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; struct jmicron_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_jmicron; static g_raid_md_taste_t g_raid_md_taste_jmicron; static g_raid_md_event_t g_raid_md_event_jmicron; static g_raid_md_ctl_t g_raid_md_ctl_jmicron; static g_raid_md_write_t g_raid_md_write_jmicron; static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; static g_raid_md_free_t g_raid_md_free_jmicron; static kobj_method_t g_raid_md_jmicron_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), { 0, 0 } }; static struct g_raid_md_class g_raid_md_jmicron_class = { "JMicron", g_raid_md_jmicron_methods, sizeof(struct g_raid_md_jmicron_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) { int k; if (g_raid_debug < 1) return; printf("********* ATA JMicron RAID Metadata *********\n"); printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); printf("version %04x\n", meta->version); printf("checksum 0x%04x\n", meta->checksum); printf("disk_id 0x%08x\n", meta->disk_id); printf("offset 0x%08x\n", meta->offset); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); printf("name <%.16s>\n", meta->name); printf("type %d\n", meta->type); printf("stripe_shift %d\n", meta->stripe_shift); printf("flags %04x\n", meta->flags); printf("spare "); for (k = 0; k < 
JMICRON_MAX_SPARE; k++) printf(" 0x%08x", meta->spare[k]); printf("\n"); printf("disks "); for (k = 0; k < JMICRON_MAX_DISKS; k++) printf(" 0x%08x", meta->disks[k]); printf("\n"); printf("=================================================\n"); } static struct jmicron_raid_conf * jmicron_meta_copy(struct jmicron_raid_conf *meta) { struct jmicron_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int jmicron_meta_total_disks(struct jmicron_raid_conf *meta) { int pos; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if (meta->disks[pos] == 0) break; } return (pos); } static int jmicron_meta_total_spare(struct jmicron_raid_conf *meta) { int pos, n; n = 0; for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if (meta->spare[pos] != 0) n++; } return (n); } /* * Generate fake Configuration ID based on disk IDs. * Note: it will change after each disk set change. */ static uint32_t jmicron_meta_config_id(struct jmicron_raid_conf *meta) { int pos; uint32_t config_id; config_id = 0; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) config_id += meta->disks[pos] << pos; return (config_id); } static void jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static int jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) { int pos; id &= JMICRON_DISK_MASK; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) return (pos); } for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) return (-3); } return (-1); } static struct jmicron_raid_conf * jmicron_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct jmicron_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct jmicron_raid_conf *)buf; /* Check if this is an JMicron RAID struct */ if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); free(meta, M_MD_JMICRON); return (NULL); } return (meta); } static int jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. 
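The reader above validates the JMicron header by summing its first 64 16-bit words and requiring the result to wrap to zero; the writer makes that hold by storing the negated sum of the remaining words in the checksum field. A minimal standalone sketch:

#include <stdint.h>

/*
 * Sum of the first 64 16-bit words (128 bytes) of the metadata sector.
 * Valid metadata sums to 0; the writer stores -sum(other words) in the
 * checksum field to make that true.
 */
static uint16_t
jm_checksum(const uint16_t *words)
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i < 64; i++)
		sum += words[i];
	return (sum);
}

Over a freshly written header jm_checksum() returns 0; any single-bit error within those 128 bytes changes one word and therefore makes the sum non-zero.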
*/ buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static int jmicron_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static struct g_raid_disk * g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_jmicron_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_jmicron_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); if (!force) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); if (!force) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd, *oldpd; struct jmicron_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); else disk_pos = -1; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* Update global metadata just in case. */ meta->disks[disk_pos] = pd->pd_disk_id; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes/offsets, * especially in concat mode. Update. */ if (!resurrection) { sd->sd_offset = (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ sd->sd_size = (((off_t)pd->pd_meta->disk_sectors_high << 16) + pd->pd_meta->disk_sectors_low) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { /* Cold-inserted or rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { /* Dirty or resyncing disk.. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_jmicron_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_JMICRON); } static void g_raid_md_jmicron_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; update = 0; do { /* Make sure we miss anything. 
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_JMICRON, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_jmicron_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ jmicron_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; size *= 512; //ZZZ vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == JMICRON_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; vol->v_mediasize = size * mdi->mdio_total_disks; } else if (meta->type == JMICRON_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; vol->v_mediasize = size; } else if (meta->type == JMICRON_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_mediasize = size * mdi->mdio_total_disks / 2; } else if (meta->type == JMICRON_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; vol->v_mediasize = 0; } else if (meta->type == JMICRON_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; vol->v_mediasize = size * (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_mediasize = 0; } vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_id = meta->disks[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. 
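g_raid_md_jmicron_start() above decodes three packed fields of struct jmicron_raid_conf: the member size split across disk_sectors_high/low, the strip size encoded as a shift, and the data offset stored in 16-sector units. A standalone sketch of the decoding, assuming 512-byte sectors as the driver does:

#include <stdint.h>

static void
jm_decode(uint32_t sectors_high, uint16_t sectors_low, uint8_t stripe_shift,
    uint32_t offset, uint64_t *bytes, uint64_t *strip, uint64_t *start)
{
	uint64_t sectors = ((uint64_t)sectors_high << 16) + sectors_low;

	*bytes = sectors * 512;			/* member size in bytes */
	*strip = (uint64_t)1024 << stripe_shift;	/* strip size */
	*start = (uint64_t)offset * 16 * 512;	/* offset is in 16-sector units */
}
/*
 * E.g. stripe_shift == 7 gives a 128 KiB strip (1024 << 7 == 131072), and
 * offset == 1 places the array data 8 KiB into the disk.
 */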
*/ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_jmicron_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct jmicron_raid_conf *pdmeta; struct g_raid_md_jmicron_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_jmicron_start_disk(disk)) g_raid_md_write_jmicron(md, NULL, NULL, NULL); } else { /* * If we haven't started yet - update common metadata * to get subdisks details, avoiding data from spare disks. */ if (mdi->mdio_meta == NULL || jmicron_meta_find_disk(mdi->mdio_meta, mdi->mdio_meta->disk_id) == -3) { if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = jmicron_meta_copy(pdmeta); mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); } mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", mdi->mdio_disks_present, mdi->mdio_total_disks, jmicron_meta_total_spare(mdi->mdio_meta)); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks + jmicron_meta_total_spare(mdi->mdio_meta)) g_raid_md_jmicron_start(sc); } } static void g_raid_jmicron_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_jmicron_object *mdi; char name[16]; mdi = (struct g_raid_md_jmicron_object *)md; mdi->mdio_config_id = arc4random(); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_jmicron_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct jmicron_raid_conf *meta; struct g_raid_md_jmicron_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); mdi = (struct g_raid_md_jmicron_object *)md; pp = cp->provider; /* Read metadata from device. 
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = jmicron_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x197b) { G_RAID_DEBUG(1, "No JMicron metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "JMicron vendor mismatch 0x%04x != 0x197b", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); if (disk_pos == -1) { G_RAID_DEBUG(1, "JMicron disk_id %08x not found", meta->disk_id); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_jmicron_print(meta); G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); spare = (disk_pos == -2) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; if (spare == 2) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == jmicron_meta_config_id(meta)) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = jmicron_meta_config_id(meta); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_jmicron_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; } else { pd->pd_disk_pos = -1; pd->pd_disk_id = meta->disk_id; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_jmicron_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_JMICRON); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_jmicron_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_jmicron(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_jmicron_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= sectorsize; /* Handle size argument. 
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) jmicron_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_jmicron(md, NULL, disk); continue; } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ jmicron_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); strncpy(meta->signature, JMICRON_MAGIC, 2); meta->version = JMICRON_VERSION; jmicron_meta_put_name(meta, vol->v_name); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = JMICRON_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = JMICRON_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = JMICRON_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = JMICRON_T_CONCAT; else meta->type = JMICRON_T_RAID5; meta->stripe_shift = fls(vol->v_strip_size / 2048); meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) meta->disks[i] = 0xffffffff; else { pd = (struct g_raid_md_jmicron_perdisk *) sd->sd_disk->d_md_data; meta->disks[i] = pd->pd_disk_id; } if (sd->sd_state < G_RAID_SUBDISK_S_STALE) meta->flags |= JMICRON_F_BADSEC; if (vol->v_dirty) meta->flags |= JMICRON_F_UNSYNC; } /* Put spares to their slots. */ spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_SPARE) continue; meta->spare[spares] = pd->pd_disk_id; if (++spares >= 2) break; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } pd->pd_meta = jmicron_meta_copy(meta); pd->pd_meta->disk_id = pd->pd_disk_id; if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { pd->pd_meta->offset = (sd->sd_offset / 512) / 16; pd->pd_meta->disk_sectors_high = (sd->sd_size / 512) >> 16; pd->pd_meta->disk_sectors_low = (sd->sd_size / 512) & 0xffff; if (sd->sd_state < G_RAID_SUBDISK_S_STALE) pd->pd_meta->flags &= ~JMICRON_F_BADSEC; else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) pd->pd_meta->flags |= JMICRON_F_UNSYNC; } G_RAID_DEBUG(1, "Writing JMicron metadata to %s", g_raid_get_diskname(disk)); g_raid_md_jmicron_print(pd->pd_meta); jmicron_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_jmicron_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. 
*/ if (pd->pd_disk_pos < 0) return (-1); if (tdisk->d_consumer) jmicron_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } static int g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_jmicron_perdisk *pd; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } free(pd, M_MD_JMICRON); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_jmicron(struct g_raid_md_object *md) { struct g_raid_md_jmicron_object *mdi; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(jmicron, "JMicron"); Index: head/sys/geom/raid/md_nvidia.c =================================================================== --- head/sys/geom/raid/md_nvidia.c (revision 350693) +++ head/sys/geom/raid/md_nvidia.c (revision 350694) @@ -1,1585 +1,1586 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); struct nvidia_raid_conf { uint8_t nvidia_id[8]; #define NVIDIA_MAGIC "NVIDIA " uint32_t config_size; uint32_t checksum; uint16_t version; uint8_t disk_number; uint8_t dummy_0; uint32_t total_sectors; uint32_t sector_size; uint8_t name[16]; uint8_t revision[4]; uint32_t disk_status; uint32_t magic_0; #define NVIDIA_MAGIC0 0x00640044 uint64_t volume_id[2]; uint8_t state; #define NVIDIA_S_IDLE 0 #define NVIDIA_S_INIT 2 #define NVIDIA_S_REBUILD 3 #define NVIDIA_S_UPGRADE 4 #define NVIDIA_S_SYNC 5 uint8_t array_width; uint8_t total_disks; uint8_t orig_array_width; uint16_t type; #define NVIDIA_T_RAID0 0x0080 #define NVIDIA_T_RAID1 0x0081 #define NVIDIA_T_RAID3 0x0083 #define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? */ #define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ #define NVIDIA_T_RAID10 0x008a #define NVIDIA_T_RAID01 0x8180 #define NVIDIA_T_CONCAT 0x00ff uint16_t dummy_3; uint32_t strip_sectors; uint32_t strip_bytes; uint32_t strip_shift; uint32_t strip_mask; uint32_t stripe_sectors; uint32_t stripe_bytes; uint32_t rebuild_lba; uint32_t orig_type; uint32_t orig_total_sectors; uint32_t status; #define NVIDIA_S_BOOTABLE 0x00000001 #define NVIDIA_S_DEGRADED 0x00000002 uint32_t filler[98]; } __packed; struct g_raid_md_nvidia_perdisk { struct nvidia_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_nvidia_object { struct g_raid_md_object mdio_base; uint64_t mdio_volume_id[2]; struct nvidia_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_t g_raid_md_create_nvidia; static g_raid_md_taste_t g_raid_md_taste_nvidia; static g_raid_md_event_t g_raid_md_event_nvidia; static g_raid_md_ctl_t g_raid_md_ctl_nvidia; static g_raid_md_write_t g_raid_md_write_nvidia; static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; static g_raid_md_free_t g_raid_md_free_nvidia; static kobj_method_t g_raid_md_nvidia_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), { 0, 0 } }; static struct g_raid_md_class g_raid_md_nvidia_class = { "NVIDIA", g_raid_md_nvidia_methods, sizeof(struct g_raid_md_nvidia_object), .mdc_enable = 1, .mdc_priority = 100 }; static int NVIDIANodeID = 1; static void g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA NVIDIA RAID Metadata *********\n"); printf("nvidia_id <%.8s>\n", meta->nvidia_id); printf("config_size %u\n", meta->config_size); printf("checksum 0x%08x\n", meta->checksum); printf("version 0x%04x\n", meta->version); printf("disk_number %d\n", meta->disk_number); printf("dummy_0 0x%02x\n", meta->dummy_0); printf("total_sectors %u\n", meta->total_sectors); printf("sector_size %u\n", meta->sector_size); printf("name <%.16s>\n", meta->name); printf("revision 0x%02x%02x%02x%02x\n", meta->revision[0], meta->revision[1], meta->revision[2], meta->revision[3]); printf("disk_status 0x%08x\n", meta->disk_status); printf("magic_0 0x%08x\n", meta->magic_0); printf("volume_id 0x%016jx%016jx\n", meta->volume_id[1], meta->volume_id[0]); printf("state 0x%02x\n", meta->state); printf("array_width %u\n", meta->array_width); printf("total_disks %u\n", meta->total_disks); printf("orig_array_width %u\n", meta->orig_array_width); printf("type 0x%04x\n", meta->type); printf("dummy_3 0x%04x\n", meta->dummy_3); printf("strip_sectors %u\n", meta->strip_sectors); printf("strip_bytes %u\n", meta->strip_bytes); printf("strip_shift %u\n", meta->strip_shift); printf("strip_mask 0x%08x\n", meta->strip_mask); printf("stripe_sectors %u\n", meta->stripe_sectors); printf("stripe_bytes %u\n", meta->stripe_bytes); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("orig_type 0x%04x\n", meta->orig_type); printf("orig_total_sectors %u\n", meta->orig_total_sectors); printf("status 0x%08x\n", meta->status); printf("=================================================\n"); } static struct nvidia_raid_conf * nvidia_meta_copy(struct nvidia_raid_conf *meta) { struct nvidia_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) { int disk_pos; if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { disk_pos = (md_disk_pos / meta->array_width) + (md_disk_pos % meta->array_width) * meta->array_width; } else disk_pos = md_disk_pos; return (disk_pos); } static void nvidia_meta_get_name(struct nvidia_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void 
nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct nvidia_raid_conf * nvidia_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct nvidia_raid_conf *meta; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct nvidia_raid_conf *)buf; /* Check if this is an NVIDIA RAID struct */ if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 128 || meta->config_size < 30) { G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } /* Check volume state. */ if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", pp->name, meta->state); free(meta, M_MD_NVIDIA); return (NULL); } /* Check raid type. */ if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && meta->type != NVIDIA_T_RAID5_SYM && meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_NVIDIA); return (NULL); } return (meta); } static int nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write metadata. 
*/ error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static int nvidia_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static struct g_raid_disk * g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_nvidia_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_nvidia_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd, *oldpd; struct nvidia_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) { disk_pos = pd->pd_meta->disk_number; if (disk_pos >= meta->total_disks || mdi->mdio_started) disk_pos = -3; } else disk_pos = -3; /* For RAID0+1 we need to translate order. */ disk_pos = nvidia_meta_translate_disk(meta, disk_pos); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 2 * 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); // else // g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == NVIDIA_T_CONCAT) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->state == NVIDIA_S_REBUILD && (pd->pd_meta->disk_status & 0x100)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else if (meta->state == NVIDIA_S_SYNC) { /* Resyncing/dirty disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_nvidia_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_NVIDIA); } static void g_raid_md_nvidia_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; update = 0; do { /* Make sure we miss anything. 
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_NVIDIA, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_nvidia_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ nvidia_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == NVIDIA_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == NVIDIA_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == NVIDIA_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == NVIDIA_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == NVIDIA_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == NVIDIA_T_RAID5_SYM) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. 
*/ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_nvidia_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct nvidia_raid_conf *pdmeta; struct g_raid_md_nvidia_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_nvidia_start_disk(disk)) g_raid_md_write_nvidia(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = nvidia_meta_copy(pdmeta); mdi->mdio_total_disks = pdmeta->total_disks; mdi->mdio_disks_present = 1; } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else G_RAID_DEBUG1(1, sc, "Spare disk"); /* If we collected all needed disks - start array. 
*/ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_nvidia_start(sc); } } static void g_raid_nvidia_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_nvidia_object *mdi; char name[32]; mdi = (struct g_raid_md_nvidia_object *)md; arc4rand(&mdi->mdio_volume_id, 16, 0); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_nvidia_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct nvidia_raid_conf *meta; struct g_raid_md_nvidia_perdisk *pd; struct g_geom *geom; int result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); mdi = (struct g_raid_md_nvidia_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = nvidia_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x10de) { G_RAID_DEBUG(1, "No NVIDIA metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "NVIDIA vendor mismatch 0x%04x != 0x10de", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ g_raid_md_nvidia_print(meta); G_RAID_DEBUG(1, "NVIDIA disk position %d", meta->disk_number); spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (memcmp(&mdi1->mdio_volume_id, &meta->volume_id, 16) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_nvidia_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. 
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_nvidia_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_NVIDIA); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) { /* Bump volume ID to drop missing disks. */ arc4rand(&mdi->mdio_volume_id, 16, 0); g_raid_md_nvidia_start(sc); } return (0); } return (-1); } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } if (mdi->mdio_started) { /* Bump volume ID to prevent disk resurrection. */ if (pd->pd_disk_pos >= 0) arc4rand(&mdi->mdio_volume_id, 16, 0); /* Write updated metadata to all disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); } /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_nvidia(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip, volsize; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_nvidia_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 2 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) volsize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) volsize = size; else if (level == G_RAID_VOLUME_RL_RAID5) volsize = size * (numdisks - 1); else { /* RAID1E */ volsize = ((size * numdisks) / strip / 2) * strip; } if (volsize > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; vol->v_mediasize = volsize; vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) nvidia_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_nvidia(md, NULL, disk); continue; } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ nvidia_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_SPARE && disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC) - 1); meta->config_size = 30; meta->version = 0x0064; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->sector_size = vol->v_sectorsize; nvidia_meta_put_name(meta, vol->v_name); meta->magic_0 = NVIDIA_MAGIC0; memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); meta->state = NVIDIA_S_IDLE; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->array_width = 1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->array_width = vol->v_disks_count - 1; else meta->array_width = vol->v_disks_count; meta->total_disks = vol->v_disks_count; meta->orig_array_width = meta->array_width; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = NVIDIA_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = NVIDIA_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = NVIDIA_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = NVIDIA_T_CONCAT; else if (vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_R5LA) meta->type = NVIDIA_T_RAID5; else meta->type = NVIDIA_T_RAID5_SYM; meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; meta->strip_bytes = vol->v_strip_size; meta->strip_shift = ffs(meta->strip_sectors) - 1; meta->strip_mask = meta->strip_sectors - 1; meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; meta->rebuild_lba = 0; meta->orig_type = meta->type; meta->orig_total_sectors = meta->total_sectors; meta->status = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC || vol->v_dirty) && meta->state != NVIDIA_S_REBUILD) meta->state = NVIDIA_S_SYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) meta->state = NVIDIA_S_REBUILD; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = meta; spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } pd->pd_meta = nvidia_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { /* For RAID0+1 we need to translate order. 
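 *
 * The metadata records the members of a RAID0+1 set in a different order
 * than the volume keeps its subdisks, so the subdisk position has to be
 * mapped before it is stored as a disk number.  An analogous translation,
 * spelled out arithmetically, appears in promise_meta_translate_disk() in
 * md_promise.c further down in this change.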
*/ pd->pd_meta->disk_number = nvidia_meta_translate_disk(meta, sd->sd_pos); if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { pd->pd_meta->disk_status = 0x100; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize * meta->array_width; } } else pd->pd_meta->disk_number = meta->total_disks + spares++; G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", g_raid_get_diskname(disk)); g_raid_md_nvidia_print(pd->pd_meta); nvidia_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_nvidia_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* Erase metadata to prevent disks's later resurrection. */ if (tdisk->d_consumer) nvidia_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } static int g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_nvidia_perdisk *pd; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } free(pd, M_MD_NVIDIA); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_nvidia(struct g_raid_md_object *md) { struct g_raid_md_nvidia_object *mdi; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(nvidia, "NVIDIA"); Index: head/sys/geom/raid/md_promise.c =================================================================== --- head/sys/geom/raid/md_promise.c (revision 350693) +++ head/sys/geom/raid/md_promise.c (revision 350694) @@ -1,2007 +1,2008 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. */ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size */ uint32_t disk_rebuild; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Voluem type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID, */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position. 
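 *
 * The fields from here to the end of the structure appear to belong to a
 * newer revision of the on-disk format: the *_high words extend
 * disk_offset, disk_sectors, disk_rebuild and total_sectors to 64 bits,
 * and promise_meta_read() replaces the filler patterns that older
 * metadata leaves in them with sane values before they are used.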
*/ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint8_t magic_6; uint8_t sector_size; uint16_t magic_7; uint32_t magic_8[31]; uint32_t backup_time; uint16_t magic_9; uint32_t disk_offset_high; uint32_t disk_sectors_high; uint32_t disk_rebuild_high; uint16_t magic_10; uint32_t magic_11[3]; uint32_t filler3[284]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id <%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("disk_rebuild %u\n", meta->disk_rebuild); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", 
meta->total_sectors_high); printf("sector_size %u\n", meta->sector_size); printf("backup_time %d\n", meta->backup_time); printf("disk_offset_high 0x%08x\n", meta->disk_offset_high); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, off_t sectors, off_t *off, off_t *size) { off_t coff, csize, tmp; int i, j; sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { tmp = ((off_t)metaarr[j]->disk_offset_high << 32) + metaarr[j]->disk_offset; if (tmp >= coff) csize = MIN(csize, tmp - coff); } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = ((off_t)metaarr[i]->disk_offset_high << 32) + metaarr[i]->disk_offset + ((off_t)metaarr[i]->disk_sectors_high << 32) + metaarr[i]->disk_sectors; csize = sectors - coff; i++; } return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; if (pp->sectorsize * 4 > MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); return (subdisks); } next: /* Read metadata block. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is an Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum. 
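 *
 * The metadata block is treated as an array of 512 32-bit words; the last
 * word stores the sum of the preceding 511, so the check below recomputes
 * that sum and compares it with meta->checksum.  A standalone sketch of
 * the same test, for illustration only (the helper name is made up):
 *
 *	static int
 *	promise_meta_cksum_ok(const struct promise_raid_conf *meta)
 *	{
 *		const uint32_t *w = (const uint32_t *)meta;
 *		uint32_t sum = 0;
 *		int i;
 *
 *		for (i = 0; i < 511; i++)
 *			sum += w[i];
 *		return (sum == meta->checksum);
 *	}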
*/ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Remove filler garbage from fields used in newer metadata. */ if (meta->disk_offset_high == 0x8b8c8d8e && meta->disk_sectors_high == 0x8788898a && meta->disk_rebuild_high == 0x83848586) { meta->disk_offset_high = 0; meta->disk_sectors_high = 0; if (meta->disk_rebuild == UINT32_MAX) meta->disk_rebuild_high = UINT32_MAX; else meta->disk_rebuild_high = 0; if (meta->total_sectors_high == 0x15161718) { meta->total_sectors_high = 0; meta->backup_time = 0; if (meta->rebuild_lba64 == 0x2122232425262728) meta->rebuild_lba64 = UINT64_MAX; } } if (meta->sector_size < 1 || meta->sector_size > 8) meta->sector_size = 1; /* Save this part and look for next. */ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; off_t off, size; int error, i, subdisk, fake; uint32_t checksum, *ptr; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset_high = off >> 32; meta->disk_offset = (uint32_t)off; meta->disk_sectors_high = size >> 32; meta->disk_sectors = (uint32_t)size; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate checksum for case if metadata were changed. 
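 *
 * The copies handed to this writer may have been modified since they were
 * read (generation bump, status, rebuild position), and the record built
 * above for unused space never had a valid sum at all, so the checksum is
 * always rebuilt over the first 511 words; the checksum word itself sits
 * outside that range, so zeroing it first is only defensive.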
*/ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; off_t tmp; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072; meta->disk_sectors_high = tmp >> 32; meta->disk_sectors = (uint32_t)tmp; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. 
*/ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t eoff, esize, size; int disk_pos, md_disk_pos, i, resurrection = 0; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { /* Find disk position in metadata by its serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. 
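 *
 * promise_meta_translate_disk() converts between the metadata order, which
 * lists the disks of one striped half of the RAID0+1 set before the other,
 * and the volume's subdisk order.  Worked example for a four-disk RAID1E
 * (width = 2): metadata positions 0,1,2,3 map to volume positions 0,2,1,3.
 * Because the mapping
 *
 *	disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width;
 *
 * is its own inverse, the same helper serves both directions: here, earlier
 * in this function, and again in g_raid_md_write_promise().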
*/ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high << 32) + pd->pd_meta[sdn]->disk_offset) * 512; sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high << 32) + pd->pd_meta[sdn]->disk_sectors) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) + pd->pd_meta[sdn]->disk_rebuild) * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
*/ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; u_int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize += ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512 * meta->sector_size; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (((off_t)meta->disk_offset_high << 32) + meta->disk_offset) * 512; sd->sd_size = (((off_t)meta->disk_sectors_high << 32) + meta->disk_sectors) * 512; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. 
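 *
 * If no volume with this 64-bit ID exists yet, one is created in the
 * STARTING state with a timeout callout behind it.  The freshness test a
 * little further down uses serial-number arithmetic so that a 16-bit
 * generation counter that has wrapped still compares correctly; the same
 * test as a standalone sketch (illustrative only, helper name made up):
 *
 *	static int
 *	promise_gen_is_newer(uint16_t candidate, uint16_t current)
 *	{
 *		return ((int16_t)(candidate - current) > 0);
 *	}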
*/ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. 
*/ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[i] = metaarr[j + 1]; metaarr[subdisks - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. 
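 *
 * A node that has lost its last disk is destroyed outright; otherwise any
 * remaining SPARE or STALE disks are offered to the now-degraded volumes
 * via g_raid_md_promise_refill().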
*/ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
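 *
 * For members that are already part of the node, the usable size is the
 * largest extent not covered by existing subdisk records, as reported by
 * promise_meta_unused_range(); new members simply reserve the trailing
 * 131072 sectors for metadata.  The common subdisk size becomes the
 * minimum of the per-disk values and the sector size the maximum.
 *
 * Worked example (numbers invented for illustration): on a 1000000-sector
 * member that already carries a subdisk record covering sectors
 * [0, 400000), promise_meta_unused_range() first trims the reserved
 * 131072 trailing sectors and then reports the largest gap,
 * [400000, 868928), so a new subdisk of up to 468928 sectors fits there.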
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
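 *
 * The exported volume size computed below follows directly from the
 * per-subdisk size chosen above.  The rule, as a standalone sketch
 * (helper name invented for illustration):
 *
 *	static off_t
 *	volume_media_size(int level, off_t sdsize, int ndisks, off_t strip)
 *	{
 *		switch (level) {
 *		case G_RAID_VOLUME_RL_RAID0:
 *		case G_RAID_VOLUME_RL_CONCAT:
 *		case G_RAID_VOLUME_RL_SINGLE:
 *			return (sdsize * ndisks);
 *		case G_RAID_VOLUME_RL_RAID1:
 *			return (sdsize);
 *		case G_RAID_VOLUME_RL_RAID3:
 *		case G_RAID_VOLUME_RL_RAID5:
 *			return (sdsize * (ndisks - 1));
 *		default:	// RAID1E: half the raw space, whole strips
 *			return (sdsize * ndisks / strip / 2 * strip);
 *		}
 *	}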
*/ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. 
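 *
 * Only volumes that reference the given volume, subdisk or disk get a
 * bumped generation and freshly built metadata below; all other volumes
 * keep their current on-disk copies untouched.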
*/ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / 512; meta->total_sectors_high = (vol->v_mediasize / 512) >> 32; meta->sector_size = vol->v_sectorsize / 512; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. 
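 *
 * GEOM tracks the rebuild position as an offset within one subdisk, while
 * the BIOS expects an LBA within the whole volume, so the value is scaled
 * by the number of data columns.  Example: on a three-disk RAID5
 * (array_width 3, two data columns per stripe) a per-disk position of
 * 1000 sectors becomes volume LBA 2000.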
*/ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. */ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset_high = (sd->sd_offset / 512) >> 32; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors_high = (sd->sd_size / 512) >> 32; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = (sd->sd_rebuild_pos / 512) >> 32; pd->pd_meta[j]->disk_rebuild = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = 0; pd->pd_meta[j]->disk_rebuild = 0; } else { pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX; pd->pd_meta[j]->disk_rebuild = UINT32_MAX; } pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. 
*/ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: head/sys/geom/raid/md_sii.c =================================================================== --- head/sys/geom/raid/md_sii.c (revision 350693) +++ head/sys/geom/raid/md_sii.c (revision 350694) @@ -1,1673 +1,1674 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
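As a sanity check on the structure declared above, the listed field widths add up to exactly one 512-byte sector, with the checksum ending at 16-bit word 159 just as the offset comments claim. A throwaway arithmetic check, not driver code:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
    unsigned bytes_to_checksum =
        54 * 2 +            /* ata_params_00_53 */
        8 +                 /* total_sectors */
        72 * 2 +            /* ata_params_58_81 (words 58..129) */
        2 + 2 + 2 + 2 +     /* product_id .. version_major */
        6 +                 /* timestamp */
        2 + 2 +             /* strip_sectors, dummy_2 */
        1 + 1 +             /* disk_number, type */
        1 + 1 + 1 + 1 +     /* raid0/raid1 disks and idents */
        8 +                 /* rebuild_lba */
        4 +                 /* generation */
        6 +                 /* disk_status .. dummy_3 */
        16 +                /* name */
        2;                  /* checksum */

    assert(bytes_to_checksum == 320);               /* words 0..159 */
    assert(bytes_to_checksum + 96 * 2 == 512);      /* plus ata_params_160_255 */
    printf("sii_raid_conf spans %u bytes\n", bytes_to_checksum + 96 * 2);
    return (0);
}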
*/ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; 
strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct sii_raid_conf *)buf; /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); g_free(buf); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. */ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. 
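The read path above requires the 16-bit sum of the first 160 words of the sector, checksum included, to be zero, and the write path rebuilds the checksum by summing the first 159 words and storing the negated result. A standalone model of that scheme; it works on a plain word array and ignores the endianness handling a real on-disk reader would need.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pick word 159 so that the 16-bit sum of all 160 words is zero. */
static void
sii_checksum_set(uint16_t words[160])
{
    uint16_t sum = 0;
    int i;

    words[159] = 0;
    for (i = 0; i < 159; i++)
        sum += words[i];
    words[159] = (uint16_t)(0 - sum);   /* two's-complement fixup */
}

/* Valid metadata sums to zero over all 160 words, checksum included. */
static int
sii_checksum_ok(const uint16_t words[160])
{
    uint16_t sum = 0;
    int i;

    for (i = 0; i <= 159; i++)
        sum += words[i];
    return (sum == 0);
}

int
main(void)
{
    uint16_t sector[160];

    memset(sector, 0xa5, sizeof(sector));
    sii_checksum_set(sector);
    printf("valid: %d\n", sii_checksum_ok(sector));
    return (0);
}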
*/ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
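Both the write and erase loops above address four copies of the metadata sector counted back from the end of the provider, one every 0x200 sectors, which is consistent with the 0x800 trailing sectors that sii_meta_write_spare() leaves out of total_sectors. A quick sketch of those byte offsets; the helper name and the example disk size are mine.

#include <stdint.h>
#include <stdio.h>

/* Byte offset of metadata copy "copy" (0..3), counted from the provider end. */
static uint64_t
sii_meta_offset(uint64_t mediasize, uint32_t sectorsize, int copy)
{
    return (mediasize - (uint64_t)sectorsize * (1 + 0x200 * copy));
}

int
main(void)
{
    uint64_t mediasize = 2000398934016ULL;  /* ~2 TB example disk */
    uint32_t sectorsize = 512;
    int i;

    for (i = 0; i < 4; i++)
        printf("copy %d at byte offset %ju\n", i,
            (uintmax_t)sii_meta_offset(mediasize, sectorsize, i));
    return (0);
}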
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. 
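Two of the checks above reduce to simple arithmetic: a candidate replacement disk must hold every subdisk plus the trailing metadata sector, and CONCAT/JBOD subdisks take the whole disk minus the reserved 0x800 sectors (1 MiB with 512-byte sectors). The helper names below are illustrative, not the driver's.

#include <stdint.h>
#include <stdio.h>

/* Fit test used when adopting a disk into an OFFLINE/FAILED slot. */
static int
replacement_fits(uint64_t disk_size, uint64_t sd_offset, uint64_t sd_size)
{
    return (sd_offset + sd_size + 512 <= disk_size);
}

/* CONCAT/JBOD subdisk size: whole disk minus the 0x800-sector reservation. */
static uint64_t
concat_subdisk_size(uint64_t disk_size)
{
    return (disk_size - 0x800 * 512);
}

int
main(void)
{
    uint64_t disk = 500107862016ULL;    /* ~500 GB example */

    printf("fits: %d, concat subdisk: %ju\n",
        replacement_fits(disk, 0, disk - 4096),
        (uintmax_t)concat_subdisk_size(disk));
    return (0);
}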
*/ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_sii_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_SII); } static void g_raid_md_sii_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_SII, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_sii_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_sii_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *best; off_t size; int j, disk_pos; uint32_t gendiff, bestgendiff; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. 
*/ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
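The start routine above derives the per-subdisk size from the volume size according to the SiI array type, and then attaches the known disks in order of smallest unsigned generation distance so that stale disks are adopted last. A sketch of the size rule only, with invented type names in place of the SII_T_* codes; CONCAT/JBOD sizes stay zero because they are recomputed later from the real disk sizes.

#include <stdint.h>
#include <stdio.h>

enum sii_type { T_RAID0, T_RAID1, T_RAID01, T_CONCAT, T_RAID5, T_JBOD };

static uint64_t
subdisk_size(enum sii_type type, uint64_t volsize, unsigned disks)
{

    switch (type) {
    case T_RAID0:
        return (volsize / disks);
    case T_RAID1:
        return (volsize);
    case T_RAID01:
        return (volsize / (disks / 2));
    case T_RAID5:
        return (volsize / (disks - 1));
    case T_CONCAT:
    case T_JBOD:
    default:
        return (0);
    }
}

int
main(void)
{
    /* 4-disk SII_T_RAID01 volume: each subdisk covers volume size / (disks / 2). */
    printf("%ju\n", (uintmax_t)subdisk_size(T_RAID01, 2000ULL << 30, 4));
    return (0);
}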
*/ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. 
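g_raid_md_sii_new_disk() above decides whether a disk carries newer metadata with (int32_t)(a - b) > 0, a serial-number style comparison that keeps working after the 32-bit generation counter wraps. A tiny standalone check of that behaviour; it assumes the usual two's-complement narrowing conversion, which the kernel code relies on as well.

#include <assert.h>
#include <stdint.h>

/* Is generation a "newer" than b, allowing for 32-bit wraparound? */
static int
gen_newer(uint32_t a, uint32_t b)
{
    return ((int32_t)(a - b) > 0);
}

int
main(void)
{
    assert(gen_newer(5, 4));
    assert(!gen_newer(4, 5));
    /* Wraparound: generation 1 is "newer" than 0xfffffffe. */
    assert(gen_newer(1, 0xfffffffeU));
    return (0);
}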
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. 
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
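In the label handler above, the per-disk usable size (already reduced by the 0x800-sector metadata reservation) is rounded down to a sector, strip, or double-strip boundary depending on the level, and the volume size then follows from it. A sketch of both rules, with stand-in level names for the G_RAID_VOLUME_RL_* constants:

#include <stdint.h>
#include <stdio.h>

enum lvl { L_RAID0, L_RAID1, L_RAID1E, L_RAID5, L_CONCAT, L_SINGLE };

/* Trim the per-disk size to the boundary required by the level. */
static uint64_t
round_disk_size(enum lvl level, uint64_t size, uint64_t sectorsize,
    uint64_t strip, unsigned numdisks)
{

    if (level == L_RAID1)
        return (size - size % sectorsize);
    if (level == L_RAID1E && (numdisks & 1) != 0)
        return (size - size % (2 * strip));
    return (size - size % strip);
}

/* Volume size as a function of the rounded per-disk size. */
static uint64_t
volume_size(enum lvl level, uint64_t size, uint64_t strip, unsigned numdisks)
{

    switch (level) {
    case L_RAID0:
    case L_CONCAT:
    case L_SINGLE:
        return (size * numdisks);
    case L_RAID1:
        return (size);
    case L_RAID5:
        return (size * (numdisks - 1));
    case L_RAID1E:
    default:
        return (((size * numdisks) / strip / 2) * strip);
    }
}

int
main(void)
{
    uint64_t size = round_disk_size(L_RAID1E, 1000204886016ULL, 512, 131072, 3);

    printf("rounded %ju, volume %ju\n", (uintmax_t)size,
        (uintmax_t)volume_size(L_RAID1E, size, 131072, 3));
    return (0);
}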
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; u_int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. 
*/ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (tdisk->d_consumer) { if (pd->pd_meta) { pd->pd_meta->disk_status = SII_S_REMOVED; sii_meta_write(tdisk->d_consumer, pd->pd_meta); } else sii_meta_erase(tdisk->d_consumer); } /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. 
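For RAID1E volumes the write path above encodes each subdisk position as a (disk_number, raid0_ident, raid1_ident) triple with raid1_disks set to 2, and sii_meta_disk_pos() earlier in this file rebuilds the flat position as raid1_ident * raid1_disks + raid0_ident. A round-trip sketch of that encoding with hypothetical helper names:

#include <assert.h>

struct sii_pos {
    int disk_number;    /* sd_pos / raid1_disks */
    int raid0_ident;    /* sd_pos % raid1_disks */
    int raid1_ident;    /* sd_pos / raid1_disks, as stored on disk */
};

static struct sii_pos
encode(int sd_pos, int raid1_disks)
{
    struct sii_pos p = {
        .disk_number = sd_pos / raid1_disks,
        .raid0_ident = sd_pos % raid1_disks,
        .raid1_ident = sd_pos / raid1_disks,
    };
    return (p);
}

static int
decode(struct sii_pos p, int raid1_disks)
{
    return (p.raid1_ident * raid1_disks + p.raid0_ident);
}

int
main(void)
{
    int pos;

    /* 8 disks arranged as mirror pairs of two: the encoding round-trips. */
    for (pos = 0; pos < 8; pos++)
        assert(decode(encode(pos, 2), 2) == pos);
    return (0);
}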
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } static int g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_sii_perdisk *pd; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } free(pd, M_MD_SII); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_sii(struct g_raid_md_object *md) { struct g_raid_md_sii_object *mdi; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(sii, "SiI"); Index: head/sys/geom/raid/tr_concat.c =================================================================== --- head/sys/geom/raid/tr_concat.c (revision 350693) +++ head/sys/geom/raid/tr_concat.c (revision 350694) @@ -1,355 +1,356 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); struct g_raid_tr_concat_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_concat; static g_raid_tr_event_t g_raid_tr_event_concat; static g_raid_tr_start_t g_raid_tr_start_concat; static g_raid_tr_stop_t g_raid_tr_stop_concat; static g_raid_tr_iostart_t g_raid_tr_iostart_concat; static g_raid_tr_iodone_t g_raid_tr_iodone_concat; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; static g_raid_tr_free_t g_raid_tr_free_concat; static kobj_method_t g_raid_tr_concat_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_concat_class = { "CONCAT", g_raid_tr_concat_methods, sizeof(struct g_raid_tr_concat_object), .trc_enable = 1, .trc_priority = 50, .trc_accept_unmapped = 1 }; static int g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_concat_object *trs; trs = (struct g_raid_tr_concat_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && !(tr->tro_volume->v_disks_count == 1 && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_concat(struct g_raid_volume *vol) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; off_t size; u_int s; int i, n, f; sc = vol->v_softc; trs = (struct g_raid_tr_concat_object *)vol->v_tr; if (trs->trso_stopped) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); if (n + f == vol->v_disks_count) { if (f == 0) s = G_RAID_VOLUME_S_OPTIMAL; else s = G_RAID_VOLUME_S_SUBOPTIMAL; } else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { /* * Some metadata modules may not know CONCAT volume * mediasize until all disks connected. Recalculate. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT && G_RAID_VOLUME_S_ALIVE(s) && !G_RAID_VOLUME_S_ALIVE(vol->v_state)) { size = 0; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) size += vol->v_subdisks[i].sd_size; } vol->v_mediasize = size; } g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; int state; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; sc = vol->v_softc; state = sd->sd_state; if (state != G_RAID_SUBDISK_S_NONE && state != G_RAID_SUBDISK_S_FAILED && state != G_RAID_SUBDISK_S_ACTIVE) { G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } if (state != sd->sd_state && !trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, sd, NULL); g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_start_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_stop_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopped = 1; g_raid_tr_update_state_concat(vol); return (0); } static void g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, length, remain; u_int no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } if (bp->bio_cmd == BIO_FLUSH) { g_raid_tr_flush_common(tr, bp); return; } offset = bp->bio_offset; remain = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", bp->bio_offset)); bioq_init(&queue); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %ju)", bp->bio_offset, bp->bio_length)); } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t 
boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; char *addr; off_t offset, length, remain; int error, no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); offset = boffset; remain = blength; addr = virtual; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", boffset)); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset, length); if (error != 0) return (error); remain -= length; addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %zu)", boffset, blength)); } while (remain > 0); return (0); } static void g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd,struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, pbp->bio_error); } } static int g_raid_tr_free_concat(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(concat, "CONCAT"); Index: head/sys/geom/raid/tr_raid0.c =================================================================== --- head/sys/geom/raid/tr_raid0.c (revision 350693) +++ head/sys/geom/raid/tr_raid0.c (revision 350694) @@ -1,337 +1,338 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data"); struct g_raid_tr_raid0_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_raid0; static g_raid_tr_event_t g_raid_tr_event_raid0; static g_raid_tr_start_t g_raid_tr_start_raid0; static g_raid_tr_stop_t g_raid_tr_stop_raid0; static g_raid_tr_iostart_t g_raid_tr_iostart_raid0; static g_raid_tr_iodone_t g_raid_tr_iodone_raid0; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0; static g_raid_tr_free_t g_raid_tr_free_raid0; static kobj_method_t g_raid_tr_raid0_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid0_class = { "RAID0", g_raid_tr_raid0_methods, sizeof(struct g_raid_tr_raid0_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static int g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_raid0_object *trs; trs = (struct g_raid_tr_raid0_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid0(struct g_raid_volume *vol) { struct g_raid_tr_raid0_object *trs; struct g_raid_softc *sc; u_int s; int n, f; sc = vol->v_softc; trs = (struct g_raid_tr_raid0_object *)vol->v_tr; if (trs->trso_stopped) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); if (n + f == vol->v_disks_count) { if (f == 0) s = G_RAID_VOLUME_S_OPTIMAL; else s = G_RAID_VOLUME_S_SUBOPTIMAL; } else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_raid0(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { struct g_raid_tr_raid0_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; int state; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; sc = vol->v_softc; state = sd->sd_state; if (state != G_RAID_SUBDISK_S_NONE && state != G_RAID_SUBDISK_S_FAILED && state != G_RAID_SUBDISK_S_ACTIVE) { G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } if (state != sd->sd_state && !trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, sd, NULL); g_raid_tr_update_state_raid0(vol); return (0); } static int g_raid_tr_start_raid0(struct g_raid_tr_object *tr) { struct g_raid_tr_raid0_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid0(vol); return (0); } static int g_raid_tr_stop_raid0(struct g_raid_tr_object *tr) { struct g_raid_tr_raid0_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopped = 1; g_raid_tr_update_state_raid0(vol); return (0); } static void g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, nstripe, remain; u_int no, strip_size; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } if (bp->bio_cmd == BIO_FLUSH) { g_raid_tr_flush_common(tr, bp); return; } if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; /* Stripe number. */ nstripe = bp->bio_offset / strip_size; /* Start position in stripe. */ start = bp->bio_offset % strip_size; /* Disk number. */ no = nstripe % vol->v_disks_count; /* Stripe start position in disk. */ offset = (nstripe / vol->v_disks_count) * strip_size; /* Length of data to operate. 
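/*
 * Aside: a minimal sketch (hypothetical helper, not part of these sources)
 * of the RAID0 address math used by g_raid_tr_iostart_raid0() and the
 * kerneldump routine below: split a volume offset into strip number,
 * offset within the strip, the disk holding that strip, and the strip's
 * start offset on that disk.
 */
static void
raid0_map(struct g_raid_volume *vol, off_t voff, u_int *no, off_t *doff,
    off_t *start)
{
	off_t nstripe;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	nstripe = voff / strip_size;		/* Stripe number. */
	*start = voff % strip_size;		/* Offset inside the strip. */
	*no = nstripe % vol->v_disks_count;	/* Disk holding the strip. */
	*doff = (nstripe / vol->v_disks_count) * strip_size;
}
/*
 * Worked example (assumed layout): strip_size = 64K, 4 disks,
 * voff = 300K -> nstripe = 4, start = 44K, disk 0, disk offset 64K.
 */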
*/ remain = bp->bio_length; bioq_init(&queue); do { length = MIN(strip_size - start, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; char *addr; off_t offset, start, length, nstripe, remain; u_int no, strip_size; int error; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); addr = virtual; strip_size = vol->v_strip_size; /* Stripe number. */ nstripe = boffset / strip_size; /* Start position in stripe. */ start = boffset % strip_size; /* Disk number. */ no = nstripe % vol->v_disks_count; /* Stripe tart position in disk. */ offset = (nstripe / vol->v_disks_count) * strip_size; /* Length of data to operate. */ remain = blength; do { length = MIN(strip_size - start, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset + start, length); if (error != 0) return (error); if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } remain -= length; addr += length; start = 0; } while (remain > 0); return (0); } static void g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd,struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, pbp->bio_error); } } static int g_raid_tr_free_raid0(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(raid0, "RAID0"); Index: head/sys/geom/raid/tr_raid1.c =================================================================== --- head/sys/geom/raid/tr_raid1.c (revision 350693) +++ head/sys/geom/raid/tr_raid1.c (revision 350694) @@ -1,986 +1,987 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" SYSCTL_DECL(_kern_geom_raid_raid1); #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1_REBUILD_CLUSTER_IDLE 100 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); #define TR_RAID1_NONE 0 #define TR_RAID1_REBUILD 1 #define TR_RAID1_RESYNC 2 #define TR_RAID1_F_DOING_SOME 0x1 #define TR_RAID1_F_LOCKED 0x2 #define TR_RAID1_F_ABORT 0x4 struct g_raid_tr_raid1_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1; static g_raid_tr_event_t g_raid_tr_event_raid1; static g_raid_tr_start_t g_raid_tr_start_raid1; static g_raid_tr_stop_t g_raid_tr_stop_raid1; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; static g_raid_tr_locked_t g_raid_tr_locked_raid1; static g_raid_tr_idle_t g_raid_tr_idle_raid1; static g_raid_tr_free_t g_raid_tr_free_raid1; static kobj_method_t g_raid_tr_raid1_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1_class = { "RAID1", g_raid_tr_raid1_methods, sizeof(struct g_raid_tr_raid1_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1_object *trs; struct g_raid_softc *sc; struct g_raid_subdisk *tsd, *bestsd; u_int s; int i, na, ns; sc = vol->v_softc; trs = (struct g_raid_tr_raid1_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { /* Make sure we have at least one ACTIVE disk. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); if (na == 0) { /* * Critical situation! We have no any active disk! * Choose the best disk we have to make it active. */ bestsd = &vol->v_subdisks[0]; for (i = 1; i < vol->v_disks_count; i++) { tsd = &vol->v_subdisks[i]; if (tsd->sd_state > bestsd->sd_state) bestsd = tsd; else if (tsd->sd_state == bestsd->sd_state && (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = tsd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } } na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na > 0) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static void g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. 
* * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. */ if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *good_sd; struct bio *bp; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) return; sd = trs->trso_failed_sd; good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); if (good_sd == NULL) { g_raid_tr_raid1_rebuild_abort(tr); return; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = sd->sd_rebuild_pos; bp->bio_length = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = good_sd; trs->trso_flags |= TR_RAID1_F_DOING_SOME; trs->trso_flags |= TR_RAID1_F_LOCKED; g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ bp->bio_offset, bp->bio_length, NULL, bp); } static void g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; trs->trso_type = TR_RAID1_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1(vol, NULL); } static void g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1_rebuild_done(trs); } static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; off_t len; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1_F_ABORT; if (trs->trso_flags & TR_RAID1_F_LOCKED) { trs->trso_flags &= ~TR_RAID1_F_LOCKED; len = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); g_raid_unlock_range(tr->tro_volume, sd->sd_rebuild_pos, len); } g_raid_tr_raid1_rebuild_done(trs); } } static void g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *fsd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No active disk to rebuild. night night."); return; } fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (fsd == NULL) { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } else { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } } } if (fsd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = fsd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", fsd->sd_volume->v_name, fsd->sd_pos, fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1_REBUILD; trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); trs->trso_meta_update = g_raid1_rebuild_meta_update; g_raid_tr_raid1_rebuild_some(tr); } static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; int na, nr; /* * If we're stopping, don't do anything. If we don't have at least one * good disk and one bad disk, we don't do anything. And if there's a * 'good disk' stored in the trs, then we're in progress and we punt. * If we make it past all these checks, we need to rebuild. 
*/ vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_stopping) return; na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1_NONE: if (na == 0) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1_rebuild_start(tr); break; case TR_RAID1_REBUILD: if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1_rebuild_abort(tr); break; case TR_RAID1_RESYNC: break; } } static int g_raid_tr_event_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1(vol, NULL); return (0); } static int g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static struct g_raid_subdisk * g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, u_int mask) { struct g_raid_subdisk *sd, *best; int i, prio, bestprio; best = NULL; bestprio = INT_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) continue; if ((mask & (1 << i)) != 0) continue; prio = G_RAID_SUBDISK_LOAD(sd); prio += min(sd->sd_recovery, 255) << 22; prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { best = sd; bestprio = prio; } } return (best); } static void g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_subdisk *sd; struct bio *cbp; sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { g_raid_iodone(bp, ENOMEM); return; } g_raid_subdisk_iostart(sd, cbp); } static void g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. 
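/*
 * Aside: a minimal sketch (not part of these sources) of the priority
 * formula used by g_raid_tr_raid1_select_read_disk() above.  Lower is
 * better: start from the subdisk's current load, penalize ongoing
 * recovery and non-ACTIVE state, and reward a disk whose head is at or
 * near the requested offset.
 */
static int
raid1_read_prio(struct g_raid_subdisk *sd, off_t bio_offset)
{
	int prio;

	prio = G_RAID_SUBDISK_LOAD(sd);
	prio += min(sd->sd_recovery, 255) << 22;
	prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
	if (G_RAID_SUBDISK_POS(sd) == bio_offset)
		prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
	else if (ABS(G_RAID_SUBDISK_POS(sd) - bio_offset) <
	    G_RAID_SUBDISK_TRACK_SIZE)
		prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
	return (prio);
}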
*/ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of the * that process. */ if (bp->bio_offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1_rebuild_fair_io; g_raid_tr_raid1_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1_object *trs; uintptr_t *mask; int error, do_write; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { /* * This operation is part of a rebuild or resync operation. * See what work just got done, then schedule the next bit of * work, if any. Rebuild/resync is done a little bit at a * time. Either when a timeout happens, or after we get a * bunch of I/Os to the disk (to make sure an active system * will complete in a sane amount of time). * * We are setup to do differing amounts of work for each of * these cases. so long as the slabs is smallish (less than * 50 or so, I'd guess, but that's just a WAG), we shouldn't * have any bio starvation issues. For active disks, we do * 5MB of data, for inactive ones, we do 50MB. */ if (trs->trso_type == TR_RAID1_REBUILD) { if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1_F_ABORT) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. 
*/ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(4, bp, "rebuild read done. %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(trs->trso_failed_sd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(4, bp, "rebuild write done. Error %d", bp->bio_error); nsd = trs->trso_failed_sd; if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1_F_ABORT) { if ((trs->trso_flags & TR_RAID1_F_ABORT) == 0) { g_raid_tr_raid1_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } rebuild_round_done: nsd = trs->trso_failed_sd; trs->trso_flags &= ~TR_RAID1_F_LOCKED; g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1_rebuild_meta_update; } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; g_raid_tr_raid1_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 1; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); if (pbp->bio_children == 1) do_write = 0; } /* * Find the other disk, and try to do the I/O to it. */ mask = (uintptr_t *)(&pbp->bio_driver2); if (pbp->bio_children == 1) { /* Save original subdisk. */ pbp->bio_driver1 = do_write ? sd : NULL; *mask = 0; } *mask |= 1 << sd->sd_pos; nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (pbp->bio_children == 2 && do_write) { sd->sd_recovery++; cbp->bio_caller1 = nsd; pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, cbp->bio_offset, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. 
* We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && pbp->bio_children > 1 && pbp->bio_driver1 != NULL) { /* * If it was a read, and bio_children is >1, then we just * recovered the data from the second drive. We should try to * write that data to the first drive if sector remapping is * enabled. A write should put the data in a new place on the * disk, remapping the bad sector. Do we need to do that by * queueing a request to the main worker thread? It doesn't * affect the return code of this current read, and can be * done at our leisure. However, to make the code simpler, it * is done synchronously. */ G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); cbp = g_clone_bio(pbp); if (cbp != NULL) { g_destroy_bio(bp); cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(pbp->bio_driver1, cbp); return; } } if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } if (pbp->bio_driver1 != NULL) { ((struct g_raid_subdisk *)pbp->bio_driver1) ->sd_recovery--; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; int error, i, ok; vol = tr->tro_volume; error = 0; ok = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of the * that process. */ if (offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } error = g_raid_subdisk_kerneldump(sd, virtual, physical, offset, length); if (error == 0) ok++; } return (ok > 0 ? 
0 : error); } static int g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; trs->trso_fair_io = g_raid1_rebuild_fair_io; trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; if (trs->trso_type == TR_RAID1_REBUILD) g_raid_tr_raid1_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1, "RAID1"); Index: head/sys/geom/raid/tr_raid1e.c =================================================================== --- head/sys/geom/raid/tr_raid1e.c (revision 350693) +++ head/sys/geom/raid/tr_raid1e.c (revision 350694) @@ -1,1244 +1,1245 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" #define N 2 SYSCTL_DECL(_kern_geom_raid_raid1e); #define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1e_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1e_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1E_REBUILD_CLUSTER_IDLE 100 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1e_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1e_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); #define TR_RAID1E_NONE 0 #define TR_RAID1E_REBUILD 1 #define TR_RAID1E_RESYNC 2 #define TR_RAID1E_F_DOING_SOME 0x1 #define TR_RAID1E_F_LOCKED 0x2 #define TR_RAID1E_F_ABORT 0x4 struct g_raid_tr_raid1e_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ off_t trso_lock_pos; /* Locked range start. */ off_t trso_lock_len; /* Locked range length. 
*/ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1e; static g_raid_tr_event_t g_raid_tr_event_raid1e; static g_raid_tr_start_t g_raid_tr_start_raid1e; static g_raid_tr_stop_t g_raid_tr_stop_raid1e; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; static g_raid_tr_locked_t g_raid_tr_locked_raid1e; static g_raid_tr_idle_t g_raid_tr_idle_raid1e; static g_raid_tr_free_t g_raid_tr_free_raid1e; static kobj_method_t g_raid_tr_raid1e_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1e_class = { "RAID1E", g_raid_tr_raid1e_methods, sizeof(struct g_raid_tr_raid1e_object), .trc_enable = 1, .trc_priority = 200, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask); static inline void V2P(struct g_raid_volume *vol, off_t virt, int *disk, off_t *offset, off_t *start) { off_t nstrip; u_int strip_size; strip_size = vol->v_strip_size; /* Strip number. */ nstrip = virt / strip_size; /* Start position in strip. */ *start = virt % strip_size; /* Disk number. */ *disk = (nstrip * N) % vol->v_disks_count; /* Strip start position in disk. */ *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; } static inline void P2V(struct g_raid_volume *vol, int disk, off_t offset, off_t *virt, int *copy) { off_t nstrip, start; u_int strip_size; strip_size = vol->v_strip_size; /* Start position in strip. */ start = offset % strip_size; /* Physical strip number. */ nstrip = (offset / strip_size) * vol->v_disks_count + disk; /* Number of physical strip (copy) inside virtual strip. */ *copy = nstrip % N; /* Offset in virtual space. 
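/*
 * Aside: a worked example of the V2P()/P2V() mapping defined below
 * (assumed layout: RAID1E with N = 2 copies, 3 disks, 64K strip).
 * For virtual offset 196608 + 4096 = 200704: nstrip = 3, start = 4096,
 * so V2P() gives disk = (3 * 2) % 3 = 0 and
 * offset = ((3 * 2) / 3) * 65536 = 131072; the second copy of the same
 * strip lands on disk 1 at the same offset.  P2V(disk 0, 131072 + 4096)
 * inverts this: physical strip (131072 / 65536) * 3 + 0 = 6,
 * copy = 6 % 2 = 0, virtual offset (6 / 2) * 65536 + 4096 = 200704.
 */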
*/ *virt = (nstrip / N) * strip_size + start; } static int g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count / N; i++) { bestsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } worstsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == vol->v_disks_count) return (G_RAID_VOLUME_S_OPTIMAL); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. 
*/ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to STALE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); g_raid_write_metadata(sc, vol, sd, sd->sd_disk); } } state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count; i++) { bestsd = &vol->v_subdisks[i]; worstsd = &vol->v_subdisks[i]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; u_int s; sc = vol->v_softc; trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { if ((vol->v_disks_count % N) == 0) s = g_raid_tr_update_state_raid1e_even(vol); else s = g_raid_tr_update_state_raid1e_odd(vol); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } if (!trs->trso_starting && !trs->trso_stopping) g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); return (0); } static void g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { struct g_raid_volume *vol; vol = sd->sd_volume; /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. 
*/ if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < vol->v_disks_count) && (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; trs->trso_type = TR_RAID1E_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1e(vol, NULL); } static void g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1e_rebuild_done(trs); } static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1E_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1E_F_ABORT; if (trs->trso_flags & TR_RAID1E_F_LOCKED) { trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); } g_raid_tr_raid1e_rebuild_done(trs); } } static void g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio *bp; off_t len, virtual, vend, offset, start; int disk, copy, best; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) return; vol = tr->tro_volume; sc = vol->v_softc; sd = trs->trso_failed_sd; while (1) { if (sd->sd_rebuild_pos >= sd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Get virtual offset from physical rebuild position. */ P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); /* Get physical offset back to get first stripe position. */ V2P(vol, virtual, &disk, &offset, &start); /* Calculate contignous data length. */ len = MIN(g_raid1e_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); if ((vol->v_disks_count % N) != 0) len = MIN(len, vol->v_strip_size - start); /* Find disk with most accurate data. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset + start, len, 0); if (best < 0) { /* There is no any valid disk. */ g_raid_tr_raid1e_rebuild_abort(tr); return; } else if (best != copy) { /* Some other disk has better data. */ break; } /* We have the most accurate data. Skip the range. 
*/ G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); sd->sd_rebuild_pos += len; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = offset + start + ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); bp->bio_length = len; bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); /* * If we are crossing stripe boundary, correct affected virtual * range we should lock. */ if (start + len > vol->v_strip_size) { P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); len = vend - virtual; } trs->trso_flags |= TR_RAID1E_F_DOING_SOME; trs->trso_flags |= TR_RAID1E_F_LOCKED; trs->trso_lock_pos = virtual; trs->trso_lock_len = len; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); } static void g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (sd == NULL) { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } else { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } } } if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = sd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1E_REBUILD; trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); trs->trso_meta_update = g_raid1e_rebuild_meta_update; g_raid_tr_raid1e_rebuild_some(tr); } static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; int nr; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_stopping) return; nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1E_NONE: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1e_rebuild_start(tr); break; case TR_RAID1E_REBUILD: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1e_rebuild_abort(tr); break; case TR_RAID1E_RESYNC: break; } } static int g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1e(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } static int g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask) { struct g_raid_subdisk *sd; off_t offset; int i, best, prio, bestprio; best = -1; bestprio = INT_MAX; for (i = 0; i < N; i++) { sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; offset = off; if (no + i >= vol->v_disks_count) offset += vol->v_strip_size; prio = G_RAID_SUBDISK_LOAD(sd); if ((mask & (1 << sd->sd_pos)) != 0) continue; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_RESYNC: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ case G_RAID_SUBDISK_S_STALE: prio += i << 24; break; case G_RAID_SUBDISK_S_REBUILD: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ default: continue; } prio += min(sd->sd_recovery, 255) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. 
*/ if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { bestprio = prio; best = i; } } return (best); } static void g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int best; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); best = g_raid_tr_raid1e_select_read_disk(vol, no, offset, length, 0); KASSERT(best >= 0, ("No readable disk in volume %s!", vol->v_name)); no += best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no += N - best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } remain -= length; addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) 
g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1e_rebuild_fair_io; g_raid_tr_raid1e_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1e_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1e_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1e_object *trs; off_t virtual, offset, start; uintptr_t mask; int error, do_write, copy, disk, best; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { if (trs->trso_type == TR_RAID1E_REBUILD) { nsd = trs->trso_failed_sd; if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1E_F_ABORT) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_offset = nsd->sd_rebuild_pos; G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(nsd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. 
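/*
 * Aside: a minimal sketch (hypothetical helper, not part of these sources)
 * of the fair-I/O throttle used by the iostart routine above: every
 * regular (non-SPECIAL) request cuts the current idle round short and
 * decrements a counter; when it reaches zero, one rebuild cycle is
 * squeezed in and the counter is reset from the rebuild_fair_io tunable.
 */
static void
raid1e_fair_io_tick(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_recover_slabs = 0;	/* Cut the idle round short. */
	if (--trs->trso_fair_io <= 0) {
		trs->trso_fair_io = g_raid1e_rebuild_fair_io;
		g_raid_tr_raid1e_rebuild_some(tr);
	}
}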
*/ G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", bp->bio_error); if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1E_F_ABORT) { if ((trs->trso_flags & TR_RAID1E_F_ABORT) == 0) { g_raid_tr_raid1e_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } rebuild_round_done: trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1e_rebuild_meta_update; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_meta_update *= g_raid1e_rebuild_slab; trs->trso_meta_update /= vol->v_strip_size; } } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; /* Run next rebuild iteration. */ g_raid_tr_raid1e_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1E_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; mask = (intptr_t)bp->bio_caller2; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 0; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); else if (mask == 0) do_write = 1; /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find the other disk, and try to do the I/O to it. */ mask |= 1 << copy; best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; cbp->bio_ma = bp->bio_ma; cbp->bio_ma_offset = bp->bio_ma_offset; cbp->bio_ma_n = bp->bio_ma_n; g_destroy_bio(bp); nsd = &vol->v_subdisks[disk]; G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (do_write) mask |= 1 << 31; if ((mask & (1U << 31)) != 0) sd->sd_recovery++; cbp->bio_caller2 = (void *)mask; if (do_write) { cbp->bio_caller1 = nsd; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. 
* We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && (mask & (1U << 31)) != 0) { G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find best disk to write. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, ~mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; cbp->bio_caller2 = (void *)mask; g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); return; } } if ((mask & (1U << 31)) != 0) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); for (copy = 0; copy < N; copy++) { if ((mask & (1 << copy) ) != 0) vol->v_subdisks[(disk + copy) % vol->v_disks_count].sd_recovery--; } if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i, error; vol = tr->tro_volume; addr = virtual; strip_size = vol->v_strip_size; V2P(vol, boffset, &no, &offset, &start); remain = blength; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } error = g_raid_subdisk_kerneldump(sd, addr, 0, offset + start, length); if (error != 0) return (error); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; addr += length; start = 0; } return (0); } static int 
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; trs->trso_fair_io = g_raid1e_rebuild_fair_io; trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_recover_slabs *= g_raid1e_rebuild_slab; trs->trso_recover_slabs /= vol->v_strip_size; } if (trs->trso_type == TR_RAID1E_REBUILD) g_raid_tr_raid1e_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1e, "RAID1E"); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 350693) +++ head/sys/geom/raid3/g_raid3.c (revision 350694) @@ -1,3586 +1,3587 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); 
case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. 
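 * An event is allocated and queued on sc_events by g_raid3_event_send()
 * (for example the orphan handler sends G_RAID3_DISK_STATE_DISCONNECTED
 * with G_RAID3_EVENT_DONTWAIT) and the worker thread is woken up.  The
 * worker dequeues events before any regular I/O and applies them through
 * g_raid3_update_disk() and g_raid3_update_device().  A caller that did
 * not pass G_RAID3_EVENT_DONTWAIT sleeps until the worker sets
 * G_RAID3_EVENT_DONE and then picks up the result from e_error.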
*/ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. 
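 * Components in the NODISK state are never counted.  For example,
 * g_raid3_bump_syncid() asserts that
 * g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) is non-zero before
 * bumping the syncid on the remaining active components.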
*/ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
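 * The g_raid3_disk structure comes from the preallocated sc_disks array
 * (indexed by md->md_no); the consumer is created, attached and opened by
 * g_raid3_connect_disk().  On failure NULL is returned and *errorp, when
 * provided, is set to the error code.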
*/ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); g_topology_lock(); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = 
disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_RAID3); if (error != 0) { if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { G_RAID3_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; } else { G_RAID3_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = 0; else { md->md_sync_offset = disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); } if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device 
%s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static int g_raid3_idle(struct g_raid3_softc *sc, int acw) { struct g_raid3_disk *disk; u_int i; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); if (!g_raid3_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } return (0); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. 
*/ #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 #define G_RAID3_FOREACH_BIO(pbp, bp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ (bp) = G_RAID3_NEXT_BIO(bp)) #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); \ (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ (bp) = (tmpbp)) static void g_raid3_init_bio(struct bio *pbp) { G_RAID3_HEAD_BIO(pbp) = NULL; } static void g_raid3_remove_bio(struct bio *cbp) { struct bio *pbp, *bp; pbp = cbp->bio_parent; if (G_RAID3_HEAD_BIO(pbp) == cbp) G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) { G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); break; } } } G_RAID3_NEXT_BIO(cbp) = NULL; } static void g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) { struct bio *pbp, *bp; g_raid3_remove_bio(sbp); pbp = dbp->bio_parent; G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); if (G_RAID3_HEAD_BIO(pbp) == dbp) G_RAID3_HEAD_BIO(pbp) = sbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == dbp) { G_RAID3_NEXT_BIO(bp) = sbp; break; } } } G_RAID3_NEXT_BIO(dbp) = NULL; } static void g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) { struct bio *bp, *pbp; size_t size; pbp = cbp->bio_parent; pbp->bio_children--; KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); size = pbp->bio_length / (sc->sc_ndisks - 1); g_raid3_free(sc, cbp->bio_data, size); if (G_RAID3_HEAD_BIO(pbp) == cbp) { G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; g_destroy_bio(cbp); } else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) break; } if (bp != NULL) { KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; } g_destroy_bio(cbp); } } static struct bio * g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) { struct bio *bp, *cbp; size_t size; int memflag; cbp = g_clone_bio(pbp); if (cbp == NULL) return (NULL); size = pbp->bio_length / (sc->sc_ndisks - 1); if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) memflag = M_WAITOK; else memflag = M_NOWAIT; cbp->bio_data = g_raid3_alloc(sc, size, memflag); if (cbp->bio_data == NULL) { pbp->bio_children--; g_destroy_bio(cbp); return (NULL); } G_RAID3_NEXT_BIO(cbp) = NULL; if (G_RAID3_HEAD_BIO(pbp) == NULL) G_RAID3_HEAD_BIO(pbp) = cbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == NULL) { G_RAID3_NEXT_BIO(bp) = cbp; break; } } } return (cbp); } static void g_raid3_scatter(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *bp, *cbp, *tmpbp; off_t atom, cadd, padd, left; int first; sc = pbp->bio_to->geom->softc; bp = NULL; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Find bio for which we should calculate data. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { bp = cbp; break; } } KASSERT(bp != NULL, ("NULL parity bio.")); } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { if (cbp == bp) continue; bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); padd += atom; } cadd += atom; } if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Calculate parity. 
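 * The parity buffer is filled by copying the first data component into
 * the parity child and XOR-ing each remaining data component into it,
 * i.e. P = D0 ^ D1 ^ ... ^ D(ndisks - 2).  Children that were cloned only
 * to take part in this calculation (G_RAID3_BIO_CFLAG_NODISK) are
 * destroyed as soon as they have been consumed.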
*/ first = 1; G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; if (first) { bcopy(cbp->bio_data, bp->bio_data, bp->bio_length); first = 0; } else { g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_length); } if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } disk = cbp->bio_caller2; if (disk == NULL) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. 
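 * Every data component is XOR-ed into xbp, the child that was flagged as
 * parity.  In VERIFY mode xbp holds the parity actually read from disk,
 * so the result must be all zeroes or a parity mismatch is reported.
 * When the parity component was read in place of a data component
 * (degraded or round-robin reads), the same XOR reconstructs the missing
 * component's data, the PARITY flag is cleared and xbp takes part in the
 * copy-out loop below.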
*/ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); g_io_deliver(pbp, pbp->bio_error); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; cbp->bio_from->index--; if (cbp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error == 0) { g_raid3_destroy_bio(sc, cbp); continue; } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } disk = cbp->bio_caller2; if (disk == NULL) { g_raid3_destroy_bio(sc, cbp); continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. 
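 * The regular request that just completed was removed from sc_inflight
 * above, so a delayed synchronization bio that was colliding with it may
 * now be able to proceed; give the sync delay queue a chance before the
 * parent request is delivered.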
*/ g_raid3_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; } } } static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc; G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp; u_int i; bioq_init(&queue); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_RAID3_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_raid3_start(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid3_start() should not be called at all. */ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_raid3_flush(sc, bp); return; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) { struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1; send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. 
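 * The synchronization bio checked here is the READ half of a sync round,
 * whose offset and length are already expressed in provider address
 * space, so they are compared directly against the regular writes tracked
 * on the sc_inflight queue.  (g_raid3_sync_collision() above performs the
 * scaling that the per-component WRITE half needs.)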
*/ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0); sstart = sbp->bio_offset; send = sstart + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * being synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. 
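 * The READ half fetched a full parent-space range, i.e. the data of all
 * sc_ndisks - 1 data components interleaved sector by sector.  For a
 * regular component only the atom at index d_no within each striped
 * sector belongs to this disk, so those atoms are compacted to the front
 * of the buffer before the request is turned into a WRITE to the disk
 * being synchronized.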
*/ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { /* Update offset_done on every 100 blocks. 
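 * moffset is the smallest offset any of the parallel synchronization bios
 * is still working on (WRITE offsets scaled back to parent space), so
 * everything below it is known to be synchronized and may be recorded in
 * the on-disk metadata as the point to restart from.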
*/ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. 
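 * The G_RAID3_BIO_PFLAG_NOPARITY flag set below tells g_raid3_scatter()
 * to skip the parity calculation for this request altogether.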
*/ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status. 
*/ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); continue; } process: bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { g_raid3_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) g_raid3_regular_request(bp); else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) g_raid3_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else if (g_raid3_register_request(bp) != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); /* * We are short in memory, let see if there are finished * request we can free. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) goto process; } /* * No finished regular request, so at least keep * synchronization running. 
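 * Keeping the rebuild moving is preferable to stalling it behind a
 * regular request that is itself waiting for memory; synchronization
 * reads use buffers preallocated in g_raid3_sync_start(), so passing
 * them through should not deepen the shortage.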
*/ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) goto process; } sx_xunlock(&sc->sc_lock); MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:lowmem", hz / 10); sx_xlock(&sc->sc_lock); } G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } } static void g_raid3_sync_start(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *bp; int error; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", sc->sc_name, sc->sc_state)); disk = NULL; for (n = 0; n < sc->sc_ndisks; n++) { if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) continue; disk = &sc->sc_disks[n]; break; } if (disk == NULL) return; sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_raid3_get_diskname(disk)); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; sc->sc_syncdisk = disk; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, M_RAID3, M_WAITOK); for (n = 0; n < g_raid3_syncreqs; n++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[n] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)n; } /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_raid3_syncreqs; /* * Fire off first synchronization requests. */ for (n = 0; n < g_raid3_syncreqs; n++) { bp = disk->d_sync.ds_bios[n]; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
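 * A delayed synchronization request is re-issued once the colliding
 * regular write has left the inflight queue.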
*/ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; struct g_consumer *cp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_raid3_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_RAID3); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; struct g_raid3_disk *disk; int n; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; pp->stripeoffset = disk->d_consumer->provider->stripeoffset; } } pp->stripesize *= sc->sc_ndisks - 1; pp->stripeoffset *= sc->sc_ndisks - 1; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk 
does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that device was started on stale disks * and more fresh disk just arrive. * If there were writes, device is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. */ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components. 
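 * A component counts as valid only if it carries the highest syncid
 * seen and is not marked SYNCHRONIZING.  For example, a
 * three-component array with syncids {4, 4, 3} yields ndisks == 2,
 * which still satisfies the "at least sc_ndisks - 1" requirement
 * checked below.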
*/ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark parity component for synchronization. */ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid need to be bumped immediately, so do it here. 
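 * Components with an older genid are dropped in the STARTING case
 * above and rejected by g_raid3_add_disk(), so bumping the genid
 * prevents a failed component from being accepted back with stale
 * data.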
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. 
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_idle(sc, disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. 
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > MAXPHYS) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 
G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert_not(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static void g_raid3_destroy_delayed(void *arg, int flag) { struct g_raid3_softc *sc; int error; if (flag == EV_CANCEL) { G_RAID3_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name); error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); if (error != 0) { G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_raid3_idle(sc, dcw); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
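 * The action geom carries the regular data path; the raid3/<name>
 * provider is created on it later by g_raid3_launch_provider().
 * The second geom set up below handles only synchronization
 * requests.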
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } sx_init(&sc->sc_lock, "graid3:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; if (!g_raid3_use_malloc) { sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; } error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
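 * If the remaining components do not appear within g_raid3_timeout
 * seconds, g_raid3_go() forces a device update, which either starts
 * the array in degraded mode or schedules it for destruction (see
 * the STARTING case in g_raid3_update_device()).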
*/ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, int how) { struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_RAID3_DESTROY_SOFT: G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_RAID3_DESTROY_DELAYED: G_RAID3_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_RAID3_DESTROY_HARD: G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); break; } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. 
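 * A tasted component joins an existing device only when both the
 * name and the id from its metadata match; the same name with a
 * different id is refused as already configured.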
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_cat(sb, "PARITY"); else sbuf_cat(sb, "DATA"); sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_cat(sb, "0%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_cat(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (!g_raid3_use_malloc) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, 
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); } sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_cat(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_cat(sb, ", "); \ else \ first = 0; \ sbuf_cat(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid3_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid3_softc *sc; int error; mp = arg; g_topology_lock(); g_raid3_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_raid3_idle(sc, -1); g_cancel_event(sc); error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_raid3_init(struct g_class *mp) { g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_post_sync == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); MODULE_VERSION(geom_raid3, 0); Index: head/sys/geom/raid3/g_raid3.h =================================================================== --- head/sys/geom/raid3/g_raid3.h (revision 350693) +++ head/sys/geom/raid3/g_raid3.h (revision 350694) @@ -1,478 +1,460 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_RAID3_H_ #define _G_RAID3_H_ #include #include #define G_RAID3_CLASS_NAME "RAID3" #define G_RAID3_MAGIC "GEOM::RAID3" /* * Version history: * 0 - Initial version number. * 1 - Added 'round-robin reading' algorithm. * 2 - Added 'verify reading' algorithm. * 3 - Added md_genid field to metadata. * 4 - Added md_provsize field to metadata. * 5 - Added 'no failure synchronization' flag. */ #define G_RAID3_VERSION 5 #define G_RAID3_DISK_FLAG_DIRTY 0x0000000000000001ULL #define G_RAID3_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL #define G_RAID3_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL #define G_RAID3_DISK_FLAG_HARDCODED 0x0000000000000008ULL #define G_RAID3_DISK_FLAG_BROKEN 0x0000000000000010ULL #define G_RAID3_DISK_FLAG_MASK (G_RAID3_DISK_FLAG_DIRTY | \ G_RAID3_DISK_FLAG_SYNCHRONIZING | \ G_RAID3_DISK_FLAG_FORCE_SYNC) #define G_RAID3_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_RAID3_DEVICE_FLAG_ROUND_ROBIN 0x0000000000000002ULL #define G_RAID3_DEVICE_FLAG_VERIFY 0x0000000000000004ULL #define G_RAID3_DEVICE_FLAG_NOFAILSYNC 0x0000000000000008ULL #define G_RAID3_DEVICE_FLAG_MASK (G_RAID3_DEVICE_FLAG_NOAUTOSYNC | \ G_RAID3_DEVICE_FLAG_ROUND_ROBIN | \ G_RAID3_DEVICE_FLAG_VERIFY | \ G_RAID3_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern u_int g_raid3_debug; -#define G_RAID3_DEBUG(lvl, ...) do { \ - if (g_raid3_debug >= (lvl)) { \ - printf("GEOM_RAID3"); \ - if (g_raid3_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_RAID3_LOGREQ(lvl, bp, ...) do { \ - if (g_raid3_debug >= (lvl)) { \ - printf("GEOM_RAID3"); \ - if (g_raid3_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_RAID3_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_RAID3", g_raid3_debug, (lvl), NULL, __VA_ARGS__) +#define G_RAID3_LOGREQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_RAID3", g_raid3_debug, (lvl), (bp), __VA_ARGS__) #define G_RAID3_BIO_CFLAG_REGULAR 0x01 #define G_RAID3_BIO_CFLAG_SYNC 0x02 #define G_RAID3_BIO_CFLAG_PARITY 0x04 #define G_RAID3_BIO_CFLAG_NODISK 0x08 #define G_RAID3_BIO_CFLAG_REGSYNC 0x10 #define G_RAID3_BIO_CFLAG_MASK (G_RAID3_BIO_CFLAG_REGULAR | \ G_RAID3_BIO_CFLAG_SYNC | \ G_RAID3_BIO_CFLAG_PARITY | \ G_RAID3_BIO_CFLAG_NODISK | \ G_RAID3_BIO_CFLAG_REGSYNC) #define G_RAID3_BIO_PFLAG_DEGRADED 0x01 #define G_RAID3_BIO_PFLAG_NOPARITY 0x02 #define G_RAID3_BIO_PFLAG_VERIFY 0x04 #define G_RAID3_BIO_PFLAG_MASK (G_RAID3_BIO_PFLAG_DEGRADED | \ G_RAID3_BIO_PFLAG_NOPARITY | \ G_RAID3_BIO_PFLAG_VERIFY) /* * Informations needed for synchronization. */ struct g_raid3_disk_sync { struct g_consumer *ds_consumer; /* Consumer connected to our device. */ off_t ds_offset; /* Offset of next request to send. */ off_t ds_offset_done; /* Offset of already synchronized region. */ off_t ds_resync; /* Resynchronize from this offset. */ u_int ds_syncid; /* Disk's synchronization ID. 
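 * All components of a consistent array share the same value;
 * a component with an older (smaller) syncid has to be
 * resynchronized before it can become ACTIVE.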
*/ u_int ds_inflight; /* Number of in-flight sync requests. */ struct bio **ds_bios; /* BIOs for synchronization I/O. */ }; /* * Informations needed for synchronization. */ struct g_raid3_device_sync { struct g_geom *ds_geom; /* Synchronization geom. */ }; #define G_RAID3_DISK_STATE_NODISK 0 #define G_RAID3_DISK_STATE_NONE 1 #define G_RAID3_DISK_STATE_NEW 2 #define G_RAID3_DISK_STATE_ACTIVE 3 #define G_RAID3_DISK_STATE_STALE 4 #define G_RAID3_DISK_STATE_SYNCHRONIZING 5 #define G_RAID3_DISK_STATE_DISCONNECTED 6 #define G_RAID3_DISK_STATE_DESTROY 7 struct g_raid3_disk { u_int d_no; /* Disk number. */ struct g_consumer *d_consumer; /* Consumer. */ struct g_raid3_softc *d_softc; /* Back-pointer to softc. */ int d_state; /* Disk state. */ uint64_t d_flags; /* Additional flags. */ u_int d_genid; /* Disk's generation ID. */ struct g_raid3_disk_sync d_sync; /* Sync information. */ LIST_ENTRY(g_raid3_disk) d_next; }; #define d_name d_consumer->provider->name #define G_RAID3_EVENT_DONTWAIT 0x1 #define G_RAID3_EVENT_WAIT 0x2 #define G_RAID3_EVENT_DEVICE 0x4 #define G_RAID3_EVENT_DONE 0x8 struct g_raid3_event { struct g_raid3_disk *e_disk; int e_state; int e_flags; int e_error; TAILQ_ENTRY(g_raid3_event) e_next; }; #define G_RAID3_DEVICE_FLAG_DESTROY 0x0100000000000000ULL #define G_RAID3_DEVICE_FLAG_WAIT 0x0200000000000000ULL #define G_RAID3_DEVICE_FLAG_DESTROYING 0x0400000000000000ULL #define G_RAID3_DEVICE_STATE_STARTING 0 #define G_RAID3_DEVICE_STATE_DEGRADED 1 #define G_RAID3_DEVICE_STATE_COMPLETE 2 /* Bump syncid on first write. */ #define G_RAID3_BUMP_SYNCID 0x1 /* Bump genid immediately. */ #define G_RAID3_BUMP_GENID 0x2 enum g_raid3_zones { G_RAID3_ZONE_64K, G_RAID3_ZONE_16K, G_RAID3_ZONE_4K, G_RAID3_NUM_ZONES }; static __inline enum g_raid3_zones g_raid3_zone(size_t nbytes) { if (nbytes > 65536) return (G_RAID3_NUM_ZONES); else if (nbytes > 16384) return (G_RAID3_ZONE_64K); else if (nbytes > 4096) return (G_RAID3_ZONE_16K); else return (G_RAID3_ZONE_4K); }; struct g_raid3_softc { u_int sc_state; /* Device state. */ uint64_t sc_mediasize; /* Device size. */ uint32_t sc_sectorsize; /* Sector size. */ uint64_t sc_flags; /* Additional flags. */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* Device unique ID. */ struct sx sc_lock; struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; struct proc *sc_worker; struct bio_queue_head sc_regular_delayed; /* Delayed I/O requests due collision with sync requests. */ struct bio_queue_head sc_inflight; /* In-flight regular write requests. */ struct bio_queue_head sc_sync_delayed; /* Delayed sync requests due collision with regular requests. */ struct g_raid3_disk *sc_disks; u_int sc_ndisks; /* Number of disks. */ u_int sc_round_robin; struct g_raid3_disk *sc_syncdisk; struct g_raid3_zone { uma_zone_t sz_zone; size_t sz_inuse; size_t sz_max; u_int sz_requested; u_int sz_failed; } sc_zones[G_RAID3_NUM_ZONES]; u_int sc_genid; /* Generation ID. */ u_int sc_syncid; /* Synchronization ID. */ int sc_bump_id; struct g_raid3_device_sync sc_sync; int sc_idle; /* DIRTY flags removed. 
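 * Non-zero while the device is idle (no recent writes), in
 * which case the per-component DIRTY flags may be cleared.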
*/ time_t sc_last_write; u_int sc_writes; TAILQ_HEAD(, g_raid3_event) sc_events; struct mtx sc_events_mtx; struct callout sc_callout; struct root_hold_token *sc_rootmount; }; #define sc_name sc_geom->name const char *g_raid3_get_diskname(struct g_raid3_disk *disk); u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state); #define G_RAID3_DESTROY_SOFT 0 #define G_RAID3_DESTROY_DELAYED 1 #define G_RAID3_DESTROY_HARD 2 int g_raid3_destroy(struct g_raid3_softc *sc, int how); int g_raid3_event_send(void *arg, int state, int flags); struct g_raid3_metadata; int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md); int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md); void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md); int g_raid3_clear_metadata(struct g_raid3_disk *disk); void g_raid3_update_metadata(struct g_raid3_disk *disk); g_ctl_req_t g_raid3_config; #endif /* _KERNEL */ struct g_raid3_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Device name. */ uint32_t md_id; /* Device unique ID. */ uint16_t md_no; /* Component number. */ uint16_t md_all; /* Number of disks in device. */ uint32_t md_genid; /* Generation ID. */ uint32_t md_syncid; /* Synchronization ID. */ uint64_t md_mediasize; /* Size of whole device. */ uint32_t md_sectorsize; /* Sector size. */ uint64_t md_sync_offset; /* Synchronized offset. */ uint64_t md_mflags; /* Additional device flags. */ uint64_t md_dflags; /* Additional disk flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void raid3_metadata_encode(struct g_raid3_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, 16); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); le32enc(data + 44, md->md_genid); le32enc(data + 48, md->md_syncid); le64enc(data + 52, md->md_mediasize); le32enc(data + 60, md->md_sectorsize); le64enc(data + 64, md->md_sync_offset); le64enc(data + 72, md->md_mflags); le64enc(data + 80, md->md_dflags); bcopy(md->md_provider, data + 88, 16); le64enc(data + 104, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 112); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 112, 16); } static __inline int raid3_metadata_decode_v0v1v2(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_syncid = le32dec(data + 44); md->md_mediasize = le64dec(data + 48); md->md_sectorsize = le32dec(data + 56); md->md_sync_offset = le64dec(data + 60); md->md_mflags = le64dec(data + 68); md->md_dflags = le64dec(data + 76); bcopy(data + 84, md->md_provider, 16); bcopy(data + 100, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 100); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 100, 16) != 0) return (EINVAL); /* New fields. 
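 * These fields do not exist in the v0-v2 on-disk layout
 * (genid appeared in version 3, provsize in version 4), so
 * give them neutral defaults here.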
*/ md->md_genid = 0; md->md_provsize = 0; return (0); } static __inline int raid3_metadata_decode_v3(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_genid = le32dec(data + 44); md->md_syncid = le32dec(data + 48); md->md_mediasize = le64dec(data + 52); md->md_sectorsize = le32dec(data + 60); md->md_sync_offset = le64dec(data + 64); md->md_mflags = le64dec(data + 72); md->md_dflags = le64dec(data + 80); bcopy(data + 88, md->md_provider, 16); bcopy(data + 104, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 104); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 104, 16) != 0) return (EINVAL); /* New fields. */ md->md_provsize = 0; return (0); } static __inline int raid3_metadata_decode_v4v5(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_genid = le32dec(data + 44); md->md_syncid = le32dec(data + 48); md->md_mediasize = le64dec(data + 52); md->md_sectorsize = le32dec(data + 60); md->md_sync_offset = le64dec(data + 64); md->md_mflags = le64dec(data + 72); md->md_dflags = le64dec(data + 80); bcopy(data + 88, md->md_provider, 16); md->md_provsize = le64dec(data + 104); bcopy(data + 112, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 112); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 112, 16) != 0) return (EINVAL); return (0); } static __inline int raid3_metadata_decode(const u_char *data, struct g_raid3_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: case 1: case 2: error = raid3_metadata_decode_v0v1v2(data, md); break; case 3: error = raid3_metadata_decode_v3(data, md); break; case 4: case 5: error = raid3_metadata_decode_v4v5(data, md); break; default: error = EINVAL; break; } return (error); } static __inline void raid3_metadata_dump(const struct g_raid3_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" name: %s\n", md->md_name); printf(" id: %u\n", (u_int)md->md_id); printf(" no: %u\n", (u_int)md->md_no); printf(" all: %u\n", (u_int)md->md_all); printf(" genid: %u\n", (u_int)md->md_genid); printf(" syncid: %u\n", (u_int)md->md_syncid); printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); printf(" mflags:"); if (md->md_mflags == 0) printf(" NONE"); else { if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) printf(" NOAUTOSYNC"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) printf(" ROUND-ROBIN"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0) printf(" VERIFY"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) printf(" NOFAILSYNC"); } printf("\n"); printf(" dflags:"); if (md->md_dflags == 0) printf(" NONE"); else { if ((md->md_dflags & G_RAID3_DISK_FLAG_DIRTY) != 0) printf(" DIRTY"); if ((md->md_dflags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) printf(" SYNCHRONIZING"); if ((md->md_dflags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) printf(" FORCE_SYNC"); } printf("\n"); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, 
sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_RAID3_H_ */ Index: head/sys/geom/sched/g_sched.c =================================================================== --- head/sys/geom/sched/g_sched.c (revision 350693) +++ head/sys/geom/sched/g_sched.c (revision 350694) @@ -1,1728 +1,1729 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Main control module for geom-based disk schedulers ('sched'). * * USER VIEW * A 'sched' node is typically inserted transparently between * an existing provider pp and its original geom gp * * [pp --> gp ..] * * using the command "geom sched insert " and * resulting in the following topology * * [pp --> sched_gp --> cp] [new_pp --> gp ... ] * * Deletion "geom sched destroy .sched." restores the * original chain. The normal "geom sched create " * is also supported. * * INTERNALS * Internally, the 'sched' uses the following data structures * * geom{} g_sched_softc{} g_gsched{} * +----------+ +---------------+ +-------------+ * | softc *-|--->| sc_gsched *-|-->| gs_init | * | ... | | | | gs_fini | * | | | [ hash table] | | gs_start | * +----------+ | | | ... | * | | +-------------+ * | | * | | g_*_softc{} * | | +-------------+ * | sc_data *-|-->| | * +---------------+ | algorithm- | * | specific | * +-------------+ * * A g_sched_softc{} is created with a "geom sched insert" call. * In turn this instantiates a specific scheduling algorithm, * which sets sc_gsched to point to the algorithm callbacks, * and calls gs_init() to create the g_*_softc{} . * The other callbacks (gs_start, gs_next, ...) 
are invoked * as needed * * g_sched_softc{} is defined in g_sched.h and mostly used here; * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) * * DATA MOVING * When a bio is received on the provider, it goes to the * g_sched_start() which calls gs_start() to initially queue it; * then we call g_sched_dispatch() that loops around gs_next() * to select zero or more bio's to be sent downstream. * * g_sched_dispatch() can also be called as a result of a timeout, * e.g. when doing anticipation or pacing requests. * * When a bio comes back, it goes to g_sched_done() which in turn * calls gs_done(). The latter does any necessary housekeeping in * the scheduling algorithm, and may decide to call g_sched_dispatch() * to send more bio's downstream. * * If an algorithm needs per-flow queues, these are created * calling gs_init_class() and destroyed with gs_fini_class(), * and they are also inserted in the hash table implemented in * the g_sched_softc{} * * If an algorithm is replaced, or a transparently-inserted node is * removed with "geom sched destroy", we need to remove all references * to the g_*_softc{} and g_sched_softc from the bio's still in * the scheduler. g_sched_forced_dispatch() helps doing this. * XXX need to explain better. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* we access curthread */ #include +#include #include "gs_scheduler.h" #include "g_sched.h" /* geom hooks */ /* * Size of the per-geom hash table storing traffic classes. * We may decide to change it at a later time, it has no ABI * implications as it is only used for run-time allocations. */ #define G_SCHED_HASH_SIZE 32 static int g_sched_destroy(struct g_geom *gp, boolean_t force); static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb); static struct g_geom *g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_sched_init(struct g_class *mp); static void g_sched_fini(struct g_class *mp); static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); struct g_class g_sched_class = { .name = G_SCHED_CLASS_NAME, .version = G_VERSION, .ctlreq = g_sched_config, .taste = g_sched_taste, .destroy_geom = g_sched_destroy_geom, .init = g_sched_init, .ioctl = g_sched_ioctl, .fini = g_sched_fini }; MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); /* * Global variables describing the state of the geom_sched module. * There is only one static instance of this structure. */ LIST_HEAD(gs_list, g_gsched); /* type, link field */ struct geom_sched_vars { struct mtx gs_mtx; struct gs_list gs_scheds; /* list of algorithms */ u_int gs_debug; u_int gs_sched_count; /* how many algorithms ? */ u_int gs_patched; /* g_io_request was patched */ u_int gs_initialized; u_int gs_expire_secs; /* expiration of hash entries */ struct bio_queue_head gs_pending; u_int gs_npending; /* The following are for stats, usually protected by gs_mtx. 
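To make the callback contract described in the big comment at the top of this file more concrete, here is a hedged sketch of a trivial single-queue algorithm and the struct g_gsched descriptor it would hand to the framework. The field names and prototypes are inferred from the call sites in this file (the authoritative declarations live in gs_scheduler.h, which is not part of this diff), and the "fifo" algorithm itself is hypothetical; a real module would normally declare its own malloc type and live in its own gs_*.c file.

/*
 * Hypothetical "fifo" scheduler: one global queue, no per-flow state.
 * Assumes <sys/param.h>, <sys/bio.h>, <sys/malloc.h> and "gs_scheduler.h".
 */
struct g_fifo_softc {
	struct bio_queue_head fq_queue;		/* pending bios, FIFO order */
};

static void *
g_fifo_init(struct g_geom *gp __unused)
{
	struct g_fifo_softc *sc;

	/* Returning NULL makes g_sched_create()/change_algo() fail with ENOMEM. */
	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc != NULL)
		bioq_init(&sc->fq_queue);
	return (sc);
}

static void
g_fifo_fini(void *data)
{

	free(data, M_GEOM_SCHED);
}

static int
g_fifo_start(void *data, struct bio *bp)
{
	struct g_fifo_softc *sc = data;

	bioq_insert_tail(&sc->fq_queue, bp);
	return (0);		/* non-zero would make g_sched_start() bypass us */
}

static struct bio *
g_fifo_next(void *data, int force __unused)
{
	struct g_fifo_softc *sc = data;

	/* A FIFO never holds requests back, so 'force' is irrelevant. */
	return (bioq_takefirst(&sc->fq_queue));	/* NULL ends the dispatch loop */
}

static void
g_fifo_done(void *data __unused, struct bio *bp __unused)
{

	/* Nothing to age or account for in a plain FIFO. */
}

static struct g_gsched g_fifo_gsched = {
	.gs_name = "fifo",
	.gs_priv_size = 0,	/* no per-flow classes, so no hash table */
	.gs_init = g_fifo_init,
	.gs_fini = g_fifo_fini,
	.gs_start = g_fifo_start,
	.gs_next = g_fifo_next,
	.gs_done = g_fifo_done,
};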
*/ u_long gs_requests; /* total requests */ u_long gs_done; /* total done */ u_int gs_in_flight; /* requests in flight */ u_int gs_writes_in_flight; u_int gs_bytes_in_flight; u_int gs_write_bytes_in_flight; char gs_names[256]; /* names of schedulers */ }; static struct geom_sched_vars me = { .gs_expire_secs = 10, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, "GEOM_SCHED stuff"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, &me.gs_bytes_in_flight, 0, "Bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, &me.gs_writes_in_flight, 0, "Write Requests in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, &me.gs_in_flight, 0, "Requests in flight"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, &me.gs_done, 0, "Total done"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, &me.gs_requests, 0, "Total requests"); SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, &me.gs_names, 0, "Algorithm names"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, &me.gs_sched_count, 0, "Number of algorithms"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, &me.gs_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, &me.gs_expire_secs, 0, "Expire time in seconds"); /* * g_sched calls the scheduler algorithms with this lock held. * The locking functions are exposed so the scheduler algorithms can also * protect themselves e.g. when running a callout handler. */ void g_sched_lock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_lock(&sc->sc_mtx); } void g_sched_unlock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_unlock(&sc->sc_mtx); } /* * Support functions to handle references to the module, * which are coming from devices using this scheduler. */ static inline void g_gsched_ref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, 1); } static inline void g_gsched_unref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, -1); } /* * Update the stats when this request is done. */ static void g_sched_update_stats(struct bio *bio) { me.gs_done++; me.gs_in_flight--; me.gs_bytes_in_flight -= bio->bio_length; if (bio->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight--; me.gs_write_bytes_in_flight -= bio->bio_length; } } /* * Dispatch any pending request. */ static void g_sched_forced_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during forced dispatch")); while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * The main dispatch loop, called either here after the start * routine, or by scheduling algorithms when they receive a timeout * or a 'done' notification. Does not share code with the forced * dispatch path, since the gs_done() callback can call us. 
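As noted above, g_sched_dispatch() may also be driven by a timeout, for example when an algorithm paces or anticipates requests. One plausible way for an algorithm to arrange that is a callout whose handler takes the same per-geom lock the framework holds when calling into the algorithm. Only g_sched_lock(), g_sched_unlock() and g_sched_dispatch() below are taken from this file; the softc layout and the pacing period are made up for the sketch.

/*
 * Hypothetical pacing timer.  The callout is assumed to be set up with
 * callout_init() in gs_init() and stopped with callout_drain() in gs_fini().
 */
struct g_myalgo_softc {
	struct g_geom	*ma_geom;	/* remembered from gs_init(gp) */
	struct callout	 ma_pace;
};

static void
g_myalgo_pace_timeout(void *arg)
{
	struct g_myalgo_softc *sc = arg;

	g_sched_lock(sc->ma_geom);
	g_sched_dispatch(sc->ma_geom);	/* push out whatever is now eligible */
	g_sched_unlock(sc->ma_geom);
}

/* Called e.g. from gs_done() to release held-back requests a bit later. */
static void
g_myalgo_pace(struct g_myalgo_softc *sc)
{

	callout_reset(&sc->ma_pace, hz / 200, g_myalgo_pace_timeout, sc);
}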
*/ void g_sched_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); if ((sc->sc_flags & G_SCHED_FLUSHING)) return; while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * Recent (8.0 and above) versions of FreeBSD have support to * register classifiers of disk requests. The classifier is * invoked by g_io_request(), and stores the information into * bp->bio_classifier1. * * Support for older versions, which is left here only for * documentation purposes, relies on two hacks: * 1. classification info is written into the bio_caller1 * field of the topmost node in the bio chain. This field * is rarely used, but this module is incompatible with * those that use bio_caller1 for other purposes, * such as ZFS and gjournal; * 2. g_io_request() is patched in-memory when the module is * loaded, so that the function calls a classifier as its * first thing. g_io_request() is restored when the module * is unloaded. This functionality is only supported for * x86 and amd64, other architectures need source code changes. */ /* * Lookup the identity of the issuer of the original request. * In the current implementation we use the curthread of the * issuer, but different mechanisms may be implemented later * so we do not make assumptions on the return value which for * us is just an opaque identifier. */ static inline u_long g_sched_classify(struct bio *bp) { /* we have classifier fields in the struct bio */ return ((u_long)bp->bio_classifier1); } /* Return the hash chain for the given key. */ static inline struct g_hash * g_sched_hash(struct g_sched_softc *sc, u_long key) { return (&sc->sc_hash[key & sc->sc_mask]); } /* * Helper function for the children classes, which takes * a geom and a bio and returns the private descriptor * associated to the request. This involves fetching * the classification field and [al]locating the * corresponding entry in the hash table. */ void * g_sched_get_class(struct g_geom *gp, struct bio *bp) { struct g_sched_softc *sc; struct g_sched_class *gsc; struct g_gsched *gsp; struct g_hash *bucket; u_long key; sc = gp->softc; key = g_sched_classify(bp); bucket = g_sched_hash(sc, key); LIST_FOREACH(gsc, bucket, gsc_clist) { if (key == gsc->gsc_key) { gsc->gsc_refs++; return (gsc->gsc_priv); } } gsp = sc->sc_gsched; gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, M_GEOM_SCHED, M_NOWAIT | M_ZERO); if (!gsc) return (NULL); if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { free(gsc, M_GEOM_SCHED); return (NULL); } gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
*/ gsc->gsc_key = key; LIST_INSERT_HEAD(bucket, gsc, gsc_clist); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; return (gsc->gsc_priv); } /* * Release a reference to the per-client descriptor, */ void g_sched_put_class(struct g_geom *gp, void *priv) { struct g_sched_class *gsc; struct g_sched_softc *sc; gsc = g_sched_priv2class(priv); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; if (--gsc->gsc_refs > 0) return; sc = gp->softc; sc->sc_gsched->gs_fini_class(sc->sc_data, priv); LIST_REMOVE(gsc, gsc_clist); free(gsc, M_GEOM_SCHED); } static void g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, struct g_gsched *gsp, void *data) { struct g_sched_class *cp, *cp2; int i; if (!hp) return; if (data && gsp->gs_hash_unref) gsp->gs_hash_unref(data); for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) g_sched_put_class(gp, cp->gsc_priv); } hashdestroy(hp, M_GEOM_SCHED, mask); } static struct g_hash * g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) { struct g_hash *hash; if (gsp->gs_priv_size == 0) return (NULL); hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); return (hash); } static void g_sched_flush_classes(struct g_geom *gp) { struct g_sched_softc *sc; struct g_sched_class *cp, *cp2; int i; sc = gp->softc; if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) return; for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) g_sched_put_class(gp, cp->gsc_priv); } } sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; } /* * Wait for the completion of any outstanding request. To ensure * that this does not take forever the caller has to make sure that * no new request enter the scehduler before calling us. * * Must be called with the gp mutex held and topology locked. */ static int g_sched_wait_pending(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; int endticks = ticks + hz; g_topology_assert(); while (sc->sc_pending && endticks - ticks >= 0) msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); return (sc->sc_pending ? ETIMEDOUT : 0); } static int g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) { struct g_sched_softc *sc = gp->softc; int error; /* Set the flushing flag: new bios will not enter the scheduler. */ sc->sc_flags |= G_SCHED_FLUSHING; g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) goto failed; /* No more requests pending or in flight from the old gsp. */ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; /* * Avoid deadlock here by releasing the gp mutex and reacquiring * it once done. It should be safe, since no reconfiguration or * destruction can take place due to the geom topology lock; no * new request can use the current sc_data since we flagged the * geom as being flushed. */ g_sched_unlock(gp); gsp->gs_fini(sc->sc_data); g_sched_lock(gp); sc->sc_gsched = NULL; sc->sc_data = NULL; g_gsched_unref(gsp); failed: sc->sc_flags &= ~G_SCHED_FLUSHING; return (error); } static int g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) { int error; g_sched_lock(gp); error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ g_sched_unlock(gp); return (error); } /* * Support function for create/taste -- locate the desired * algorithm and grab a reference to it. 
*/ static struct g_gsched * g_gsched_find(const char *name) { struct g_gsched *gsp = NULL; mtx_lock(&me.gs_mtx); LIST_FOREACH(gsp, &me.gs_scheds, glist) { if (strcmp(name, gsp->gs_name) == 0) { g_gsched_ref(gsp); break; } } mtx_unlock(&me.gs_mtx); return (gsp); } /* * Rebuild the list of scheduler names. * To be called with me.gs_mtx lock held. */ static void g_gsched_build_names(struct g_gsched *gsp) { int pos, l; struct g_gsched *cur; pos = 0; LIST_FOREACH(cur, &me.gs_scheds, glist) { l = strlen(cur->gs_name); if (l + pos + 1 + 1 < sizeof(me.gs_names)) { if (pos != 0) me.gs_names[pos++] = ' '; strcpy(me.gs_names + pos, cur->gs_name); pos += l; } } me.gs_names[pos] = '\0'; } /* * Register or unregister individual scheduling algorithms. */ static int g_gsched_register(struct g_gsched *gsp) { struct g_gsched *cur; int error = 0; mtx_lock(&me.gs_mtx); LIST_FOREACH(cur, &me.gs_scheds, glist) { if (strcmp(gsp->gs_name, cur->gs_name) == 0) break; } if (cur != NULL) { G_SCHED_DEBUG(0, "A scheduler named %s already" "exists.", gsp->gs_name); error = EEXIST; } else { LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); gsp->gs_refs = 1; me.gs_sched_count++; g_gsched_build_names(gsp); } mtx_unlock(&me.gs_mtx); return (error); } struct g_gsched_unregparm { struct g_gsched *gup_gsp; int gup_error; }; static void g_gsched_unregister(void *arg, int flag) { struct g_gsched_unregparm *parm = arg; struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; struct g_sched_softc *sc; struct g_geom *gp, *gp_tmp; int error; parm->gup_error = 0; g_topology_assert(); if (flag == EV_CANCEL) return; mtx_lock(&me.gs_mtx); LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { if (gp->class != &g_sched_class) continue; /* Should not happen. */ sc = gp->softc; if (sc->sc_gsched == gsp) { error = g_sched_remove(gp, gsp); if (error) goto failed; } } LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { if (cur != gsp) continue; if (gsp->gs_refs != 1) { G_SCHED_DEBUG(0, "%s still in use.", gsp->gs_name); parm->gup_error = EBUSY; } else { LIST_REMOVE(gsp, glist); me.gs_sched_count--; g_gsched_build_names(gsp); } break; } if (cur == NULL) { G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); parm->gup_error = ENOENT; } failed: mtx_unlock(&me.gs_mtx); } static inline void g_gsched_global_init(void) { if (!me.gs_initialized) { G_SCHED_DEBUG(0, "Initializing global data."); mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); LIST_INIT(&me.gs_scheds); bioq_init(&me.gs_pending); me.gs_initialized = 1; } } /* * Module event called when a scheduling algorithm module is loaded or * unloaded. */ int g_gsched_modevent(module_t mod, int cmd, void *arg) { struct g_gsched *gsp = arg; struct g_gsched_unregparm parm; int error; G_SCHED_DEBUG(0, "Modevent %d.", cmd); /* * If the module is loaded at boot, the geom thread that calls * g_sched_init() might actually run after g_gsched_modevent(), * so make sure that the module is properly initialized. 
*/ g_gsched_global_init(); error = EOPNOTSUPP; switch (cmd) { case MOD_LOAD: error = g_gsched_register(gsp); G_SCHED_DEBUG(0, "Loaded module %s error %d.", gsp->gs_name, error); if (error == 0) g_retaste(&g_sched_class); break; case MOD_UNLOAD: parm.gup_gsp = gsp; parm.gup_error = 0; error = g_waitfor_event(g_gsched_unregister, &parm, M_WAITOK, NULL); if (error == 0) error = parm.gup_error; G_SCHED_DEBUG(0, "Unloaded module %s error %d.", gsp->gs_name, error); break; } return (error); } #ifdef KTR #define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) static inline char g_sched_type(struct bio *bp) { if (bp->bio_cmd == BIO_READ) return ('R'); else if (bp->bio_cmd == BIO_WRITE) return ('W'); return ('U'); } static inline void g_sched_trace_bio_START(struct bio *bp) { CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } static inline void g_sched_trace_bio_DONE(struct bio *bp) { CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } #else /* !KTR */ #define TRC_BIO_EVENT(e, bp) #endif /* !KTR */ /* * g_sched_done() and g_sched_start() dispatch the geom requests to * the scheduling algorithm in use. */ static void g_sched_done(struct bio *bio) { struct g_geom *gp = bio->bio_caller2; struct g_sched_softc *sc = gp->softc; TRC_BIO_EVENT(DONE, bio); KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); g_sched_lock(gp); g_sched_update_stats(bio); sc->sc_gsched->gs_done(sc->sc_data, bio); if (!--sc->sc_pending) wakeup(gp); g_sched_flush_classes(gp); g_sched_unlock(gp); g_std_done(bio); } static void g_sched_start(struct bio *bp) { struct g_geom *gp = bp->bio_to->geom; struct g_sched_softc *sc = gp->softc; struct bio *cbp; TRC_BIO_EVENT(START, bp); G_SCHED_LOGREQ(bp, "Request received."); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_sched_done; cbp->bio_to = LIST_FIRST(&gp->provider); KASSERT(cbp->bio_to != NULL, ("NULL provider")); /* We only schedule reads and writes. */ if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE) goto bypass; G_SCHED_LOGREQ(cbp, "Sending request."); g_sched_lock(gp); /* * Call the algorithm's gs_start to queue the request in the * scheduler. If gs_start fails then pass the request down, * otherwise call g_sched_dispatch() which tries to push * one or more requests down. */ if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || sc->sc_gsched->gs_start(sc->sc_data, cbp)) { g_sched_unlock(gp); goto bypass; } /* * We use bio_caller1 to mark requests that are scheduled * so make sure it is not NULL. */ if (cbp->bio_caller1 == NULL) cbp->bio_caller1 = &me; /* anything not NULL */ cbp->bio_caller2 = gp; sc->sc_pending++; /* Update general stats. */ me.gs_in_flight++; me.gs_requests++; me.gs_bytes_in_flight += bp->bio_length; if (bp->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight++; me.gs_write_bytes_in_flight += bp->bio_length; } g_sched_dispatch(gp); g_sched_unlock(gp); return; bypass: cbp->bio_done = g_std_done; cbp->bio_caller1 = NULL; /* not scheduled */ g_io_request(cbp, LIST_FIRST(&gp->consumer)); } /* * The next few functions are the geom glue. 
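For completeness, this is one plausible way an algorithm module would hook itself to g_gsched_modevent() above. The module name and the g_fifo_gsched descriptor are the hypothetical ones from the earlier sketch, while DECLARE_MODULE()/MODULE_DEPEND() are the stock FreeBSD module macros; the real gs_*.c files may wrap this differently.

/* Hypothetical module glue for the "fifo" sketch above. */
static moduledata_t g_fifo_mod = {
	"geom_sched_fifo",	/* module name */
	g_gsched_modevent,	/* shared event handler */
	&g_fifo_gsched,		/* handed to the handler as 'arg' */
};

DECLARE_MODULE(geom_sched_fifo, g_fifo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(geom_sched_fifo, geom_sched, 0, 0, 0);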
*/ static void g_sched_orphan(struct g_consumer *cp) { g_topology_assert(); g_sched_destroy(cp->geom, 1); } static int g_sched_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_sched_temporary_start(struct bio *bio) { mtx_lock(&me.gs_mtx); me.gs_npending++; bioq_disksort(&me.gs_pending, bio); mtx_unlock(&me.gs_mtx); } static void g_sched_flush_pending(g_start_t *start) { struct bio *bp; while ((bp = bioq_takefirst(&me.gs_pending))) start(bp); } static int g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) { struct g_sched_softc *sc = gp->softc; g_start_t *saved_start, *flush = g_sched_start; int error = 0, endticks = ticks + hz; g_cancel_event(newpp); /* prevent taste() */ /* copy private fields */ newpp->private = pp->private; newpp->index = pp->index; /* Queue all the early requests coming for us. */ me.gs_npending = 0; saved_start = pp->geom->start; dstgp->start = g_sched_temporary_start; while (pp->nstart - pp->nend != me.gs_npending && endticks - ticks >= 0) tsleep(pp, PRIBIO, "-", hz/10); if (pp->nstart - pp->nend != me.gs_npending) { flush = saved_start; error = ETIMEDOUT; goto fail; } /* link pp to this geom */ LIST_REMOVE(pp, provider); pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); /* * replicate the counts from the parent in the * new provider and consumer nodes */ cp->acr = newpp->acr = pp->acr; cp->acw = newpp->acw = pp->acw; cp->ace = newpp->ace = pp->ace; sc->sc_flags |= G_SCHED_PROXYING; fail: dstgp->start = saved_start; g_sched_flush_pending(flush); return (error); } /* * Create a geom node for the device passed as *pp. * If successful, add a reference to this gsp. */ static int g_sched_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp, int proxy) { struct g_sched_softc *sc = NULL; struct g_geom *gp, *dstgp; struct g_provider *newpp = NULL; struct g_consumer *cp = NULL; char name[64]; int error; g_topology_assert(); snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Geom %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_gsched = gsp; sc->sc_data = gsp->gs_init(gp); if (sc->sc_data == NULL) { error = ENOMEM; goto fail; } sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); /* * Do not initialize the flush mechanism, will be initialized * on the first insertion on the hash table. */ mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); gp->softc = sc; gp->start = g_sched_start; gp->orphan = g_sched_orphan; gp->access = g_sched_access; gp->dumpconf = g_sched_dumpconf; newpp = g_new_providerf(dstgp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; cp = g_new_consumer(gp); error = g_attach(cp, proxy ? 
newpp : pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } g_error_provider(newpp, 0); if (proxy) { error = g_insert_proxy(gp, newpp, dstgp, pp, cp); if (error) goto fail; } G_SCHED_DEBUG(0, "Device %s created.", gp->name); g_gsched_ref(gsp); return (0); fail: if (cp != NULL) { if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (newpp != NULL) g_destroy_provider(newpp); if (sc->sc_hash) g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); if (sc->sc_data) gsp->gs_fini(sc->sc_data); g_free(gp->softc); g_destroy_geom(gp); return (error); } /* * Support for dynamic switching of scheduling algorithms. * First initialize the data structures for the new algorithm, * then call g_sched_remove_locked() to flush all references * to the old one, finally link the new algorithm. */ static int g_sched_change_algo(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp) { struct g_sched_softc *sc; struct g_geom *gp; struct g_hash *newh; void *data; u_long mask; int error = 0; gp = pp->geom; sc = gp->softc; data = gsp->gs_init(gp); if (data == NULL) return (ENOMEM); newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); if (gsp->gs_priv_size && !newh) { error = ENOMEM; goto fail; } g_sched_lock(gp); if (sc->sc_gsched) { /* can be NULL in some cases */ error = g_sched_remove_locked(gp, sc->sc_gsched); if (error) goto fail; } g_gsched_ref(gsp); sc->sc_gsched = gsp; sc->sc_data = data; sc->sc_hash = newh; sc->sc_mask = mask; g_sched_unlock(gp); return (0); fail: if (newh) g_sched_hash_fini(gp, newh, mask, gsp, data); if (data) gsp->gs_fini(data); g_sched_unlock(gp); return (error); } /* * Stop the request flow directed to the proxy, redirecting the new * requests to the me.gs_pending queue. */ static struct g_provider * g_detach_proxy(struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp, *newpp; do { pp = LIST_FIRST(&gp->provider); if (pp == NULL) break; cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; me.gs_npending = 0; pp->geom->start = g_sched_temporary_start; return (pp); } while (0); printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); return (NULL); } static void g_sched_blackhole(struct bio *bp) { g_io_deliver(bp, ENXIO); } static inline void g_reparent_provider(struct g_provider *pp, struct g_geom *gp, struct g_provider *newpp) { LIST_REMOVE(pp, provider); if (newpp) { pp->private = newpp->private; pp->index = newpp->index; } pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); } static inline void g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) { struct g_geom *gp = oldpp->geom; g_reparent_provider(oldpp, newpp->geom, newpp); /* * Hackish: let the system destroy the old provider for us, just * in case someone attached a consumer to it, in which case a * direct call to g_destroy_provider() would not work. */ g_reparent_provider(newpp, gp, NULL); } /* * Complete the proxy destruction, linking the old provider to its * original geom, and destroying the proxy provider. Also take care * of issuing the pending requests collected in me.gs_pending (if any). */ static int g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) { struct g_consumer *cp; struct g_provider *newpp; do { cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; /* Relink the provider to its original geom. 
*/ g_unproxy_provider(oldpp, newpp); /* Detach consumer from provider, and destroy provider. */ cp->acr = newpp->acr = 0; cp->acw = newpp->acw = 0; cp->ace = newpp->ace = 0; g_detach(cp); /* Send the pending bios through the right start function. */ g_sched_flush_pending(oldpp->geom->start); return (0); } while (0); printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); /* We cannot send the pending bios anywhere... */ g_sched_flush_pending(g_sched_blackhole); return (EINVAL); } static int g_sched_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp, *oldpp = NULL; struct g_sched_softc *sc; struct g_gsched *gsp; int error; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); if (!(sc->sc_flags & G_SCHED_PROXYING)) { pp = LIST_FIRST(&gp->provider); if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { const char *msg = force ? "but we force removal" : "cannot remove"; G_SCHED_DEBUG(!force, "Device %s is still open (r%dw%de%d), %s.", pp->name, pp->acr, pp->acw, pp->ace, msg); if (!force) return (EBUSY); } else { G_SCHED_DEBUG(0, "Device %s removed.", gp->name); } } else oldpp = g_detach_proxy(gp); gsp = sc->sc_gsched; if (gsp) { /* * XXX bad hack here: force a dispatch to release * any reference to the hash table still held by * the scheduler. */ g_sched_lock(gp); /* * We are dying here, no new requests should enter * the scheduler. This is granted by the topolgy, * either in case we were proxying (new bios are * being redirected) or not (see the access check * above). */ g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) { /* * Not all the requests came home: this might happen * under heavy load, or if we were waiting for any * bio which is served in the event path (see * geom_slice.c for an example of how this can * happen). Try to restore a working configuration * if we can fail. */ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { g_sched_flush_pending(force ? g_sched_blackhole : g_sched_start); } /* * In the forced destroy case there is not so much * we can do, we have pending bios that will call * g_sched_done() somehow, and we don't want them * to crash the system using freed memory. We tell * the user that something went wrong, and leak some * memory here. * Note: the callers using force = 1 ignore the * return value. */ if (force) { G_SCHED_DEBUG(0, "Pending requests while " " destroying geom, some memory leaked."); } return (error); } g_sched_unlock(gp); g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; gsp->gs_fini(sc->sc_data); g_gsched_unref(gsp); sc->sc_gsched = NULL; } else error = 0; if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { error = g_destroy_proxy(gp, oldpp); if (error) { if (force) { G_SCHED_DEBUG(0, "Unrecoverable error while " "destroying a proxy geom, leaking some " " memory."); } return (error); } } mtx_destroy(&sc->sc_mtx); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (error); } static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_sched_destroy(gp, 0)); } /* * Functions related to the classification of requests. * * On recent FreeBSD versions (8.0 and above), we store a reference * to the issuer of a request in bp->bio_classifier1 as soon * as the bio is posted to the geom queue (and not later, because * requests are managed by the g_down thread afterwards). 
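The hook registered below tags each bio with curthread, so every kernel thread becomes its own flow. The same interface could carry a coarser policy; as an illustration, a hypothetical per-process classifier would differ only in what it stores in bio_classifier1 (g_register_classifier() and struct g_classifier_hook are taken from the code below, the rest is an assumption).

/* Hypothetical alternative: classify by issuing process, not thread. */
static int
g_sched_tag_by_proc(void *arg __unused, struct bio *bp)
{

	bp->bio_classifier1 = curthread->td_proc;
	return (1);
}

static struct g_classifier_hook g_sched_proc_classifier = {
	.func = g_sched_tag_by_proc,
};

/* Registered and unregistered exactly like the default hook below. */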
*/ /* * Classifier support for recent FreeBSD versions: we use * a very simple classifier, only use curthread to tag a request. * The classifier is registered at module load, and unregistered * at module unload. */ static int g_sched_tag(void *arg, struct bio *bp) { bp->bio_classifier1 = curthread; return (1); } static struct g_classifier_hook g_sched_classifier = { .func = g_sched_tag, }; static inline void g_classifier_ini(void) { g_register_classifier(&g_sched_classifier); } static inline void g_classifier_fini(void) { g_unregister_classifier(&g_sched_classifier); } static void g_sched_init(struct g_class *mp) { g_gsched_global_init(); G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", mp, &g_sched_class); /* Patch g_io_request to store classification info in the bio. */ g_classifier_ini(); } static void g_sched_fini(struct g_class *mp) { g_classifier_fini(); G_SCHED_DEBUG(0, "Unloading..."); KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); mtx_destroy(&me.gs_mtx); } static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_geom *gp; cp = LIST_FIRST(&pp->geom->consumer); if (cp == NULL) return (ENOIOCTL); gp = cp->provider->geom; if (gp->ioctl == NULL) return (ENOIOCTL); return (gp->ioctl(cp->provider, cmd, data, fflag, td)); } /* * Read the i-th argument for a request, skipping the /dev/ * prefix if present. */ static const char * g_sched_argi(struct gctl_req *req, int i) { static const char *dev_prefix = "/dev/"; const char *name; char param[16]; int l = strlen(dev_prefix); snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) gctl_error(req, "No 'arg%d' argument", i); else if (strncmp(name, dev_prefix, l) == 0) name += l; return (name); } /* * Fetch nargs and do appropriate checks. */ static int g_sched_get_nargs(struct gctl_req *req) { int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return (0); } if (*nargs <= 0) gctl_error(req, "Missing device(s)."); return (*nargs); } /* * Check whether we should add the class on certain volumes when * this geom is created. Right now this is under control of a kenv * variable containing the names of all devices that we care about. * Probably we should only support transparent insertion as the * preferred mode of operation. */ static struct g_geom * g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_gsched *gsp = NULL; /* the . algorithm we want */ const char *s; /* generic string pointer */ const char *taste_names; /* devices we like */ int l; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_SCHED_DEBUG(2, "Tasting %s.", pp->name); do { /* do not taste on ourselves */ if (pp->geom->class == mp) break; taste_names = kern_getenv("geom.sched.taste"); if (taste_names == NULL) break; l = strlen(pp->name); for (s = taste_names; *s && (s = strstr(s, pp->name)); s++) { /* further checks for an exact match */ if ( (s == taste_names || s[-1] == ' ') && (s[l] == '\0' || s[l] == ' ') ) break; } if (s == NULL) break; G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", pp->name, s); /* look up the provider name in the list */ s = kern_getenv("geom.sched.algo"); if (s == NULL) s = "rr"; gsp = g_gsched_find(s); /* also get a reference */ if (gsp == NULL) { G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); break; } /* XXX create with 1 as last argument ? 
*/ g_sched_create(NULL, mp, pp, gsp, 0); g_gsched_unref(gsp); } while (0); return NULL; } static void g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_create(req, mp, pp, gsp, proxy) != 0) break; } g_gsched_unref(gsp); } static void g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_change_algo(req, mp, pp, gsp) != 0) break; } g_gsched_unref(gsp); } static struct g_geom * g_sched_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int nargs, *force, error, i; struct g_geom *gp; const char *name; g_topology_assert(); nargs = g_sched_get_nargs(req); force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; gp = g_sched_find_geom(mp, name); if (gp == NULL) { G_SCHED_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); break; } error = g_sched_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); break; } } } static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SCHED_VERSION) { gctl_error(req, "Userland and kernel parts are " "out of sync."); return; } if (strcmp(verb, "create") == 0) { g_sched_ctl_create(req, mp, 0); return; } else if (strcmp(verb, "insert") == 0) { g_sched_ctl_create(req, mp, 1); return; } else if (strcmp(verb, "configure") == 0) { g_sched_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_sched_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown 
verb."); } static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); } if (gsp != NULL && gsp->gs_dumpconf) gsp->gs_dumpconf(sb, indent, gp, cp, pp); } DECLARE_GEOM_CLASS(g_sched_class, g_sched); MODULE_VERSION(geom_sched, 0); Index: head/sys/geom/sched/g_sched.h =================================================================== --- head/sys/geom/sched/g_sched.h (revision 350693) +++ head/sys/geom/sched/g_sched.h (revision 350694) @@ -1,127 +1,111 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _G_SCHED_H_ #define _G_SCHED_H_ /* * $Id$ * $FreeBSD$ * * Header for the geom_sched class (userland library and kernel part). * See g_sched.c for documentation. * The userland code only needs the three G_SCHED_* values below. */ #define G_SCHED_CLASS_NAME "SCHED" #define G_SCHED_VERSION 0 #define G_SCHED_SUFFIX ".sched." #ifdef _KERNEL -#define G_SCHED_DEBUG(lvl, ...) do { \ - if (me.gs_debug >= (lvl)) { \ - printf("GEOM_SCHED"); \ - if (me.gs_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) - -#define G_SCHED_LOGREQ(bp, ...) do { \ - if (me.gs_debug >= 2) { \ - printf("GEOM_SCHED[2]: "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_SCHED_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_SCHED", me.gs_debug, (lvl), NULL, __VA_ARGS__) +#define G_SCHED_LOGREQ(bp, ...) \ + _GEOM_DEBUG("GEOM_SCHED", me.gs_debug, 2, (bp), __VA_ARGS__) LIST_HEAD(g_hash, g_sched_class); /* * Descriptor of a scheduler. * In addition to the obvious fields, sc_flushing and sc_pending * support dynamic switching of scheduling algorithm. * Normally, sc_flushing is 0, and requests that are scheduled are * also added to the sc_pending queue, and removed when we receive * the 'done' event. * * When we are transparently inserted on an existing provider, * sc_proxying is set. 
The detach procedure is slightly different. * * When switching schedulers, sc_flushing is set so requests bypass us, * and at the same time we update the pointer in the pending bios * to ignore us when they return up. * XXX it would be more efficient to implement sc_pending with * a generation number: the softc generation is increased when * we change scheduling algorithm, we store the current generation * number in the pending bios, and when they come back we ignore * the done() call if the generation number do not match. */ struct g_sched_softc { /* * Generic fields used by any scheduling algorithm: * a mutex, the class descriptor, flags, list of pending * requests (used when flushing the module) and support * for hash tables where we store per-flow queues. */ struct mtx sc_mtx; struct g_gsched *sc_gsched; /* Scheduler descriptor. */ int sc_pending; /* Pending requests. */ int sc_flags; /* Various flags. */ /* * Hash tables to store per-flow queues are generally useful * so we handle them in the common code. * sc_hash and sc_mask are parameters of the hash table, * the last two fields are used to periodically remove * expired items from the hash table. */ struct g_hash *sc_hash; u_long sc_mask; int sc_flush_ticks; /* Next tick for a flush. */ int sc_flush_bucket; /* Next bucket to flush. */ /* * Pointer to the algorithm's private data, which is the value * returned by sc_gsched->gs_init() . A NULL here means failure. * XXX intptr_t might be more appropriate. */ void *sc_data; }; #define G_SCHED_PROXYING 1 #define G_SCHED_FLUSHING 2 #endif /* _KERNEL */ #endif /* _G_SCHED_H_ */ Index: head/sys/geom/shsec/g_shsec.c =================================================================== --- head/sys/geom/shsec/g_shsec.c (revision 350693) +++ head/sys/geom/shsec/g_shsec.c (revision 350694) @@ -1,839 +1,840 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_shsec, "GEOM shared secret device support"); static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data"); static uma_zone_t g_shsec_zone; static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force); static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_shsec_taste; static g_ctl_req_t g_shsec_config; static g_dumpconf_t g_shsec_dumpconf; static g_init_t g_shsec_init; static g_fini_t g_shsec_fini; struct g_class g_shsec_class = { .name = G_SHSEC_CLASS_NAME, .version = G_VERSION, .ctlreq = g_shsec_config, .taste = g_shsec_taste, .destroy_geom = g_shsec_destroy_geom, .init = g_shsec_init, .fini = g_shsec_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW, 0, "GEOM_SHSEC stuff"); static u_int g_shsec_debug = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0, "Debug level"); static u_int g_shsec_maxmem = MAXPHYS * 100; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_shsec_maxmem, 0, "Maximum memory that can be allocated for I/O (in bytes)"); static u_int g_shsec_alloc_failed = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD, &g_shsec_alloc_failed, 0, "How many times I/O allocation failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_shsec_init(struct g_class *mp __unused) { g_shsec_zone = uma_zcreate("g_shsec_zone", MAXPHYS, NULL, NULL, NULL, NULL, 0, 0); g_shsec_maxmem -= g_shsec_maxmem % MAXPHYS; uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / MAXPHYS); } static void g_shsec_fini(struct g_class *mp __unused) { uma_zdestroy(g_shsec_zone); } /* * Return the number of valid disks. */ static u_int g_shsec_nvalid(struct g_shsec_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_shsec_remove_disk(struct g_consumer *cp) { struct g_shsec_softc *sc; u_int no; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_shsec_softc *)cp->private; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); no = cp->index; G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); sc->sc_disks[no] = NULL; if (sc->sc_provider != NULL) { g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name); } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); } static void g_shsec_orphan(struct g_consumer *cp) { struct g_shsec_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_shsec_remove_disk(cp); /* If there are no valid disks anymore, remove device. */ if (g_shsec_nvalid(sc) == 0) g_shsec_destroy(sc, 1); } static int g_shsec_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2; struct g_shsec_softc *sc; struct g_geom *gp; int error; gp = pp->geom; sc = gp->softc; if (sc == NULL) { /* * It looks like geom is being withered. * In that case we allow only negative requests. 
*/ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("Positive access request (device=%s).", pp->name)); if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) { G_SHSEC_DEBUG(0, "Device %s definitely destroyed.", gp->name); } return (0); } /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; error = ENXIO; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, dr, dw, de); if (error == 0) continue; /* * If we fail here, backout all previous changes. */ LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) return (error); g_access(cp2, -dr, -dw, -de); } /* NOTREACHED */ } return (error); } static void g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) *dst = *dst ^ *src++; KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_done(struct bio *bp) { struct g_shsec_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; if (bp->bio_error == 0) G_SHSEC_LOGREQ(2, bp, "Request done."); else { G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; } if (pbp->bio_cmd == BIO_READ) { if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) { bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length); pbp->bio_pflags = 0; } else { g_shsec_xor1((uint32_t *)bp->bio_data, (uint32_t *)pbp->bio_data, (ssize_t)pbp->bio_length); } } bzero(bp->bio_data, bp->bio_length); uma_zfree(g_shsec_zone, bp->bio_data); g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_io_deliver(pbp, pbp->bio_error); } } static void g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) { *rand = arc4random(); *dst = *dst ^ *rand++; } KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_start(struct bio *bp) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_shsec_softc *sc; struct bio *cbp; uint32_t *dst; ssize_t len; u_int no; int error; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_shsec_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_SHSEC_LOGREQ(2, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_FLUSH: /* * Only those requests are supported. */ break; case BIO_DELETE: case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } /* * Allocate all bios first and calculate XOR. */ dst = NULL; len = bp->bio_length; if (bp->bio_cmd == BIO_READ) bp->bio_pflags = G_SHSEC_BFLAG_FIRST; for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ cbp->bio_done = g_shsec_done; cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT); if (cbp->bio_data == NULL) { g_shsec_alloc_failed++; error = ENOMEM; goto failure; } cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd == BIO_WRITE) { if (no == 0) { dst = (uint32_t *)cbp->bio_data; bcopy(bp->bio_data, dst, len); } else { g_shsec_xor2((uint32_t *)cbp->bio_data, dst, len); } } } /* * Fire off all allocated requests! 
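The write path above is a plain XOR secret-sharing scheme: component 0 starts out holding the plaintext, every other component gets freshly generated random words, and g_shsec_xor2() folds each random share into component 0 as it goes, so no single disk ever stores recoverable data. On read, g_shsec_done() copies whichever share completes first and XORs the remaining ones in via g_shsec_xor1(). A small standalone userland illustration of the same invariant for two shares (ordinary C, not kernel code; arc4random() as provided on FreeBSD):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	uint32_t plain[4] = { 0xdeadbeef, 0x00c0ffee, 0x12345678, 0x9abcdef0 };
	uint32_t share0[4], share1[4], out[4];
	int i;

	/* Write side: share0 = plaintext, share1 = random, share0 ^= share1. */
	memcpy(share0, plain, sizeof(plain));
	for (i = 0; i < 4; i++) {
		share1[i] = arc4random();
		share0[i] ^= share1[i];
	}

	/* Read side: copy the first share that arrives, XOR in the rest. */
	memcpy(out, share0, sizeof(out));
	for (i = 0; i < 4; i++)
		out[i] ^= share1[i];

	assert(memcmp(out, plain, sizeof(plain)) == 0);
	return (0);
}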
*/ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_SHSEC_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); } return; failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; if (cbp->bio_data != NULL) { bzero(cbp->bio_data, cbp->bio_length); uma_zfree(g_shsec_zone, cbp->bio_data); } g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } static void g_shsec_check_and_run(struct g_shsec_softc *sc) { off_t mediasize, ms; u_int no, sectorsize = 0; if (g_shsec_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name); /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; mediasize -= sc->sc_disks[0]->provider->sectorsize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { ms = sc->sc_disks[no]->provider->mediasize; ms -= sc->sc_disks[no]->provider->sectorsize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, sc->sc_disks[no]->provider->sectorsize); } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize; g_error_provider(sc->sc_provider, 0); G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name); } static int g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ shsec_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; struct g_shsec_metadata md; int error; /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } /* Reread metadata. */ error = g_shsec_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } cp->private = sc; cp->index = no; sc->sc_disks[no] = cp; G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_shsec_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md) { struct g_shsec_softc *sc; struct g_geom *gp; u_int no; G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. 
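g_shsec_check_and_run() above derives the shared provider's geometry from its components: the usable media size is that of the smallest component minus one sector (the last sector of every component is reserved for the metadata), and the sector size is the least common multiple of the component sector sizes, so mixing a 512-byte-sector disk with a 4096-byte-sector disk yields a 4096-byte provider sector. A tiny illustration using the gcd()/lcm() helpers defined earlier in this file (the component sizes are made up):

/* Hypothetical components: 1 GB with 512 B sectors and 1 GB with 4 kB sectors. */
static void
g_shsec_example_geometry(void)
{
	off_t mediasize;
	u_int sectorsize;

	mediasize = MIN((off_t)1073741824 - 512, (off_t)1073741824 - 4096);
	sectorsize = lcm(512, 4096);	/* gcd() is 512, so lcm() is 4096 */

	printf("shsec provider: %jd bytes, %u byte sectors\n",
	    (intmax_t)mediasize, sectorsize);
}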
*/ if (md->md_all < 2) { G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_SHSEC_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO); gp->start = g_shsec_start; gp->spoiled = g_shsec_orphan; gp->orphan = g_shsec_orphan; gp->access = g_shsec_access; gp->dumpconf = g_shsec_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_SHSEC, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force) { struct g_provider *pp; struct g_geom *gp; u_int no; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_SHSEC_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_SHSEC_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } for (no = 0; no < sc->sc_ndisks; no++) { if (sc->sc_disks[no] != NULL) g_shsec_remove_disk(sc->sc_disks[no]); } gp = sc->sc_geom; gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)", gp->name)); free(sc->sc_disks, M_SHSEC); free(sc, M_SHSEC); pp = LIST_FIRST(&gp->provider); if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_shsec_softc *sc; sc = gp->softc; return (g_shsec_destroy(sc, 0)); } static struct g_geom * g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_shsec_metadata md; struct g_shsec_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_SHSEC_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "shsec:taste"); gp->start = g_shsec_start; gp->access = g_shsec_access; gp->orphan = g_shsec_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_shsec_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0) return (NULL); if (md.md_version > G_SHSEC_VERSION) { G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 1) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_shsec_create(mp, &md); if (gp == NULL) { G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_shsec_destroy(sc, 1); return (NULL); } } return (gp); } static struct g_shsec_softc * g_shsec_find_device(struct g_class *mp, const char *name) { struct g_shsec_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_shsec_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_shsec_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_shsec_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SHSEC_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "stop") == 0) { g_shsec_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_shsec_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s%u\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_shsec_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_printf(sb, "UP"); else sbuf_printf(sb, "DOWN"); sbuf_printf(sb, "\n"); } } DECLARE_GEOM_CLASS(g_shsec_class, g_shsec); MODULE_VERSION(geom_shsec, 0); Index: head/sys/geom/shsec/g_shsec.h =================================================================== --- head/sys/geom/shsec/g_shsec.h (revision 350693) +++ head/sys/geom/shsec/g_shsec.h (revision 350694) @@ -1,119 +1,101 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_SHSEC_H_ #define _G_SHSEC_H_ #include #define G_SHSEC_CLASS_NAME "SHSEC" #define G_SHSEC_MAGIC "GEOM::SHSEC" /* * Version history: * 0 - Initial version number. * 1 - Added md_provsize field to metadata. */ #define G_SHSEC_VERSION 1 #ifdef _KERNEL #define G_SHSEC_BFLAG_FIRST 0x1 -#define G_SHSEC_DEBUG(lvl, ...) do { \ - if (g_shsec_debug >= (lvl)) { \ - printf("GEOM_SHSEC"); \ - if (g_shsec_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_SHSEC_LOGREQ(lvl, bp, ...) do { \ - if (g_shsec_debug >= (lvl)) { \ - printf("GEOM_SHSEC"); \ - if (g_shsec_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_SHSEC_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_SHSEC", g_shsec_debug, (lvl), NULL, __VA_ARGS__) +#define G_SHSEC_LOGREQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_SHSEC", g_shsec_debug, (lvl), (bp), __VA_ARGS__) struct g_shsec_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* device unique ID */ struct g_consumer **sc_disks; uint16_t sc_ndisks; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_shsec_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Stripe name. */ uint32_t md_id; /* Unique ID. 
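 * The id is normally a random value chosen when the device is
 * labelled; the taste path matches on name and id, so two devices
 * that happen to share a name are never mixed.
 *
 * Note on the hunk above: G_SHSEC_DEBUG and G_SHSEC_LOGREQ now simply
 * delegate to the shared _GEOM_DEBUG helper pulled in by the added
 * include, passing the "GEOM_SHSEC" prefix, the class debug sysctl,
 * the level and an optional bio.  The helper itself is not part of
 * this hunk; as a rough, assumed sketch it is expected to do what the
 * removed open-coded macros did, i.e. something like:
 *
 *	if (g_shsec_debug >= (lvl)) {
 *		printf("GEOM_SHSEC");
 *		if (g_shsec_debug > 0)
 *			printf("[%u]", (lvl));
 *		printf(": ");
 *		printf(__VA_ARGS__);
 *		if (bp != NULL) {
 *			printf(" ");
 *			g_print_bio(bp);
 *		}
 *		printf("\n");
 *	}
 *
 * so the change is meant to centralize, not alter, the debug output.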
*/ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void shsec_metadata_encode(const struct g_shsec_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); bcopy(md->md_provider, data + 44, sizeof(md->md_provider)); le64enc(data + 60, md->md_provsize); } static __inline void shsec_metadata_decode(const u_char *data, struct g_shsec_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); bcopy(data + 44, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 60); } #endif /* _G_SHSEC_H_ */ Index: head/sys/geom/stripe/g_stripe.c =================================================================== --- head/sys/geom/stripe/g_stripe.c (revision 350693) +++ head/sys/geom/stripe/g_stripe.c (revision 350694) @@ -1,1274 +1,1275 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include FEATURE(geom_stripe, "GEOM striping support"); static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data"); static uma_zone_t g_stripe_zone; static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force); static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_stripe_taste; static g_ctl_req_t g_stripe_config; static g_dumpconf_t g_stripe_dumpconf; static g_init_t g_stripe_init; static g_fini_t g_stripe_fini; struct g_class g_stripe_class = { .name = G_STRIPE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_stripe_config, .taste = g_stripe_taste, .destroy_geom = g_stripe_destroy_geom, .init = g_stripe_init, .fini = g_stripe_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW, 0, "GEOM_STRIPE stuff"); static u_int g_stripe_debug = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0, "Debug level"); static int g_stripe_fast = 0; static int g_sysctl_stripe_fast(SYSCTL_HANDLER_ARGS) { int error, fast; fast = g_stripe_fast; error = sysctl_handle_int(oidp, &fast, 0, req); if (error == 0 && req->newptr != NULL) g_stripe_fast = fast; return (error); } SYSCTL_PROC(_kern_geom_stripe, OID_AUTO, fast, CTLTYPE_INT | CTLFLAG_RWTUN, NULL, 0, g_sysctl_stripe_fast, "I", "Fast, but memory-consuming, mode"); static u_int g_stripe_maxmem = MAXPHYS * 100; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_stripe_maxmem, 0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)"); static u_int g_stripe_fast_failed = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD, &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_stripe_init(struct g_class *mp __unused) { g_stripe_zone = uma_zcreate("g_stripe_zone", MAXPHYS, NULL, NULL, NULL, NULL, 0, 0); g_stripe_maxmem -= g_stripe_maxmem % MAXPHYS; uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / MAXPHYS); } static void g_stripe_fini(struct g_class *mp __unused) { uma_zdestroy(g_stripe_zone); } /* * Return the number of valid disks. */ static u_int g_stripe_nvalid(struct g_stripe_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_stripe_remove_disk(struct g_consumer *cp) { struct g_stripe_softc *sc; g_topology_assert(); KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_stripe_softc *)cp->geom->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); if (cp->private == NULL) { G_STRIPE_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); cp->private = (void *)(uintptr_t)-1; } if (sc->sc_provider != NULL) { G_STRIPE_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; sc->sc_disks[cp->index] = NULL; cp->index = 0; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device. 
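 * Once the last consumer is gone the geom cannot carry any I/O, so the
 * whole softc is destroyed (force set); the provider itself has already
 * been withered above when the first disk went away.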
*/ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_stripe_destroy(sc, 1); } static void g_stripe_orphan(struct g_consumer *cp) { struct g_stripe_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_stripe_remove_disk(cp); } static int g_stripe_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_stripe_softc *sc; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && cp1->private != NULL) { g_stripe_remove_disk(cp1); /* May destroy geom. */ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset, off_t length, int mode) { off_t stripesize; size_t len; stripesize = sc->sc_stripesize; len = (size_t)(stripesize - (offset & (stripesize - 1))); do { bcopy(src, dst, len); if (mode) { dst += len + stripesize * (sc->sc_ndisks - 1); src += len; } else { dst += len; src += len + stripesize * (sc->sc_ndisks - 1); } length -= len; KASSERT(length >= 0, ("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).", (uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length)); if (length > stripesize) len = stripesize; else len = length; } while (length > 0); } static void g_stripe_done(struct bio *bp) { struct g_stripe_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) { g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset, bp->bio_length, 1); bp->bio_data = bp->bio_caller1; bp->bio_caller1 = NULL; } mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); if (pbp->bio_driver1 != NULL) uma_zfree(g_stripe_zone, pbp->bio_driver1); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } static int g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_stripe_softc *sc; char *addr, *data = NULL; struct bio *cbp; off_t stripesize; u_int nparts = 0; int error; sc = bp->bio_to->geom->softc; addr = bp->bio_data; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. 
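 * "Fast" mode issues at most one clone per disk.  A clone that ends up
 * covering more than one chunk cannot point straight into the caller's
 * buffer, so its data is bounced through a single MAXPHYS-sized
 * allocation from g_stripe_zone that is carved into one compact region
 * per such clone: g_stripe_copy() with mode 0 gathers the caller's
 * scattered chunks into that region before a write is sent down, and
 * with mode 1 spreads a completed read back out in g_stripe_done().
 * Chunks that belong to the same disk sit (ndisks - 1) stripes of other
 * disks' data apart in the caller's buffer, which is exactly the stride
 * g_stripe_copy() walks.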
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; cbp->bio_length = length; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize, addr += stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } if (nparts >= sc->sc_ndisks) { cbp = TAILQ_NEXT(cbp, bio_queue); if (cbp == NULL) cbp = TAILQ_FIRST(&queue); nparts++; /* * Update bio structure. */ /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length += MIN(stripesize, length); if (cbp->bio_caller1 == NULL) { cbp->bio_caller1 = cbp->bio_data; cbp->bio_data = NULL; if (data == NULL) { data = uma_zalloc(g_stripe_zone, M_NOWAIT); if (data == NULL) { error = ENOMEM; goto failure; } } } } else { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. */ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); cbp->bio_caller2 = sc->sc_disks[no]; } } if (data != NULL) bp->bio_driver1 = data; /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; if (cbp->bio_caller1 != NULL) { cbp->bio_data = data; if (bp->bio_cmd == BIO_WRITE) { g_stripe_copy(sc, cbp->bio_caller1, data, cbp->bio_offset, cbp->bio_length, 0); } data += cbp->bio_length; } G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: if (data != NULL) uma_zfree(g_stripe_zone, data); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); if (cbp->bio_caller1 != NULL) { cbp->bio_data = cbp->bio_caller1; cbp->bio_caller1 = NULL; } bp->bio_children--; g_destroy_bio(cbp); } return (error); } static int g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_stripe_softc *sc; off_t stripesize; struct bio *cbp; char *addr; int error; sc = bp->bio_to->geom->softc; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ if (bp->bio_length == length) cbp->bio_done = g_std_done; /* Optimized lockless case. */ else cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; addr = NULL; } else addr = bp->bio_data; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); if (bp->bio_cmd != BIO_DELETE) addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. 
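 * "Economic" mode, by contrast, never copies data: one clone is issued
 * per chunk touched by the request, each pointing straight into the
 * caller's buffer (or, for unmapped bios, at the proper offset into the
 * caller's page list).  A request that fits entirely inside a single
 * chunk completes through g_std_done and so never takes sc_lock;
 * everything else funnels through g_stripe_done() as usual.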
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd != BIO_DELETE) addr += stripesize; } /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; g_destroy_bio(cbp); } return (error); } static void g_stripe_flush(struct g_stripe_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_stripe_done; cbp->bio_caller2 = sc->sc_disks[no]; cbp->bio_to = sc->sc_disks[no]->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_STRIPE_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; g_io_request(cbp, cp); } } static void g_stripe_start(struct bio *bp) { off_t offset, start, length, nstripe, stripesize; struct g_stripe_softc *sc; u_int no; int error, fast = 0; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_stripe_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_STRIPE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_stripe_flush(sc, bp); return; case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } stripesize = sc->sc_stripesize; /* * Calculations are quite messy, but fast I hope. */ /* Stripe number. */ /* nstripe = bp->bio_offset / stripesize; */ nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits; /* Disk number. */ no = nstripe % sc->sc_ndisks; /* Start position in stripe. */ /* start = bp->bio_offset % stripesize; */ start = bp->bio_offset & (stripesize - 1); /* Start position in disk. */ /* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */ offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start; /* Length of data to operate. */ length = MIN(bp->bio_length, stripesize - start); /* * Do use "fast" mode when: * 1. "Fast" mode is ON. * and * 2. Request size is less than or equal to MAXPHYS, * which should always be true. * and * 3. Request size is bigger than stripesize * ndisks. If it isn't, * there will be no need to send more than one I/O request to * a provider, so there is nothing to optmize. * and * 4. Request is not unmapped. * and * 5. It is not a BIO_DELETE. 
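 *
 * A worked example of the address arithmetic above, for a purely
 * illustrative configuration of 3 disks and a 64 kB stripe
 * (sc_stripebits = 16), given a 128 kB request at byte offset 200 kB:
 *
 *	nstripe = 204800 >> 16              = 3
 *	no      = 3 % 3                     = 0     (starts on disk 0)
 *	start   = 204800 & 65535            = 8192  (8 kB into the stripe)
 *	offset  = ((3 / 3) << 16) + 8192    = 73728 (72 kB into disk 0)
 *	length  = min(131072, 65536 - 8192) = 57344
 *
 * Disk 0 therefore gets the first 56 kB at offset 72 kB, and the
 * remaining 72 kB is carved into a 64 kB chunk for disk 1 and an 8 kB
 * chunk for disk 2, both starting at offset 64 kB within their
 * providers.  If the criteria above are not met, or the fast path fails
 * to allocate its buffer, the request falls back to the economic path.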
*/ if (g_stripe_fast && bp->bio_length <= MAXPHYS && bp->bio_length >= stripesize * sc->sc_ndisks && (bp->bio_flags & BIO_UNMAPPED) == 0 && bp->bio_cmd != BIO_DELETE) { fast = 1; } error = 0; if (fast) { error = g_stripe_start_fast(bp, no, offset, length); if (error != 0) g_stripe_fast_failed++; } /* * Do use "economic" when: * 1. "Economic" mode is ON. * or * 2. "Fast" mode failed. It can only fail if there is no memory. */ if (!fast || error != 0) error = g_stripe_start_economic(bp, no, offset, length); if (error != 0) { if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } } static void g_stripe_check_and_run(struct g_stripe_softc *sc) { struct g_provider *dp; off_t mediasize, ms; u_int no, sectorsize = 0; g_topology_assert(); if (g_stripe_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s", sc->sc_name); sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (g_stripe_fast == 0) sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED; /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) mediasize -= sc->sc_disks[0]->provider->sectorsize; mediasize -= mediasize % sc->sc_stripesize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { dp = sc->sc_disks[no]->provider; ms = dp->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) ms -= dp->sectorsize; ms -= ms % sc->sc_stripesize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_STRIPE_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED; } } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize * sc->sc_ndisks; sc->sc_provider->stripesize = sc->sc_stripesize; sc->sc_provider->stripeoffset = 0; g_error_provider(sc->sc_provider, 0); G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ stripe_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = no; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) { struct g_stripe_metadata md; /* Reread metadata. 
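 * The label is read again here, after the consumer has been attached
 * and has inherited the geom's current access counts, to make sure the
 * provider still carries metadata for this very device: magic, name and
 * id must all still match, otherwise the attach is rejected as
 * "metadata changed".  This closes the window between tasting the
 * provider and actually wiring it into the stripe.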
*/ error = g_stripe_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } sc->sc_disks[no] = cp; G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_stripe_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md, u_int type) { struct g_stripe_softc *sc; struct g_geom *gp; u_int no; g_topology_assert(); G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. */ if (md->md_all < 2) { G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } #if 0 /* Stripe size have to be grater than or equal to sector size. */ if (md->md_stripesize < sectorsize) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } #endif /* Stripe size have to be power of 2. */ if (!powerof2(md->md_stripesize)) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_STRIPE_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO); gp->start = g_stripe_start; gp->spoiled = g_stripe_orphan; gp->orphan = g_stripe_orphan; gp->access = g_stripe_access; gp->dumpconf = g_stripe_dumpconf; sc->sc_id = md->md_id; sc->sc_stripesize = md->md_stripesize; sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1); sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_STRIPE, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_STRIPE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_STRIPE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_stripe_remove_disk(cp); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? 
(device=%s)", gp->name)); free(sc->sc_disks, M_STRIPE); mtx_destroy(&sc->sc_lock); free(sc, M_STRIPE); G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_stripe_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_stripe_softc *sc; sc = gp->softc; return (g_stripe_destroy(sc, 0)); } static struct g_geom * g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_stripe_metadata md; struct g_stripe_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_STRIPE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "stripe:taste"); gp->start = g_stripe_start; gp->access = g_stripe_access; gp->orphan = g_stripe_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_stripe_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0) return (NULL); if (md.md_version > G_STRIPE_VERSION) { printf("geom_stripe.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 2) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 3) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
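 * As in the SHSEC taste path above, both the name and the id from the
 * on-disk label must match an existing geom; in addition only automatic
 * devices are considered, so a device assembled by hand with
 * "gstripe create" is never grown behind the administrator's back by a
 * tasted provider.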
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC); if (gp == NULL) { G_STRIPE_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_stripe_destroy(sc, 1); return (NULL); } } return (gp); } static void g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_stripe_metadata md; struct g_provider *pp; struct g_stripe_softc *sc; struct g_geom *gp; struct sbuf *sb; off_t *stripesize; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic)); md.md_version = G_STRIPE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "No '%s' argument.", "stripesize"); return; } md.md_stripesize = (uint32_t)*stripesize; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
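 * (md_provsize only matters when an on-disk label is being validated in
 * the taste path; a manually created device is assembled from the
 * request arguments alone, so zero is fine here.)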
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_STRIPE_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); return; } } gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); continue; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); KASSERT(pp != NULL, ("Provider %s disappear?!", name)); if (g_stripe_add_disk(sc, pp, no - 1) != 0) { G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_stripe_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_stripe_softc * g_stripe_find_device(struct g_class *mp, const char *name) { struct g_stripe_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_stripe_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_stripe_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_stripe_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_STRIPE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_stripe_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_stripe_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_stripe_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s%u\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_stripesize); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_STRIPE_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_STRIPE_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_stripe_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_cat(sb, "UP"); else sbuf_cat(sb, "DOWN"); sbuf_cat(sb, "\n"); } } DECLARE_GEOM_CLASS(g_stripe_class, g_stripe); MODULE_VERSION(geom_stripe, 0); Index: head/sys/geom/stripe/g_stripe.h =================================================================== --- head/sys/geom/stripe/g_stripe.h (revision 350693) +++ head/sys/geom/stripe/g_stripe.h (revision 350694) @@ -1,126 +1,111 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_STRIPE_H_ #define _G_STRIPE_H_ #include #define G_STRIPE_CLASS_NAME "STRIPE" #define G_STRIPE_MAGIC "GEOM::STRIPE" /* * Version history: * 0 - Initial version number. * 1 - Added 'stop' command for gstripe(8). * 2 - Added md_provider field to metadata and '-h' option for gstripe(8). * 3 - Added md_provsize field to metadata. */ #define G_STRIPE_VERSION 3 #ifdef _KERNEL #define G_STRIPE_TYPE_MANUAL 0 #define G_STRIPE_TYPE_AUTOMATIC 1 -#define G_STRIPE_DEBUG(lvl, ...) do { \ - if (g_stripe_debug >= (lvl)) { \ - printf("GEOM_STRIPE"); \ - if (g_stripe_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) -#define G_STRIPE_LOGREQ(bp, ...) do { \ - if (g_stripe_debug >= 2) { \ - printf("GEOM_STRIPE[2]: "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_STRIPE_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_STRIPE", g_stripe_debug, (lvl), NULL, __VA_ARGS__) +#define G_STRIPE_LOGREQ(bp, ...) 
\ + _GEOM_DEBUG("GEOM_STRIPE", g_stripe_debug, 2, (bp), __VA_ARGS__) struct g_stripe_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* stripe unique ID */ struct g_consumer **sc_disks; uint16_t sc_ndisks; off_t sc_stripesize; uint32_t sc_stripebits; struct mtx sc_lock; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_stripe_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Stripe name. */ uint32_t md_id; /* Unique ID. */ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ uint32_t md_stripesize; /* Stripe size. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void stripe_metadata_encode(const struct g_stripe_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); le32enc(data + 44, md->md_stripesize); bcopy(md->md_provider, data + 48, sizeof(md->md_provider)); le64enc(data + 64, md->md_provsize); } static __inline void stripe_metadata_decode(const u_char *data, struct g_stripe_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_stripesize = le32dec(data + 44); bcopy(data + 48, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 64); } #endif /* _G_STRIPE_H_ */ Index: head/sys/geom/vinum/geom_vinum.c =================================================================== --- head/sys/geom/vinum/geom_vinum.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum.c (revision 350694) @@ -1,1051 +1,1052 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, vinum, CTLFLAG_RW, 0, "GEOM_VINUM stuff"); u_int g_vinum_debug = 0; SYSCTL_UINT(_kern_geom_vinum, OID_AUTO, debug, CTLFLAG_RWTUN, &g_vinum_debug, 0, "Debug level"); static int gv_create(struct g_geom *, struct gctl_req *); static void gv_attach(struct gv_softc *, struct gctl_req *); static void gv_detach(struct gv_softc *, struct gctl_req *); static void gv_parityop(struct gv_softc *, struct gctl_req *); static void gv_orphan(struct g_consumer *cp) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d; g_topology_assert(); KASSERT(cp != NULL, ("gv_orphan: null cp")); gp = cp->geom; KASSERT(gp != NULL, ("gv_orphan: null gp")); sc = gp->softc; KASSERT(sc != NULL, ("gv_orphan: null sc")); d = cp->private; KASSERT(d != NULL, ("gv_orphan: null d")); g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name); gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); } void gv_start(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; gp = bp->bio_to->geom; sc = gp->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } void gv_done(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; KASSERT(bp != NULL, ("NULL bp")); gp = bp->bio_from->geom; sc = gp->softc; mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_up, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d, *d2; int error; gp = pp->geom; sc = gp->softc; /* * We want to modify the read count with the write count in case we have * plexes in a RAID-5 organization. 
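 * A write to a RAID-5 plex is a read-modify-write: the other columns of
 * the stripe have to be read back so the parity block can be
 * recomputed.  Folding the write count into the read count up front
 * keeps every underlying drive readable whenever it is writable; for
 * example, a plain write open of the volume, g_access(pp, 0, 1, 0),
 * reaches each drive consumer below as g_access(cp, 1, 1, 0).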
*/ dr += dw; LIST_FOREACH(d, &sc->drives, drive) { if (d->consumer == NULL) continue; error = g_access(d->consumer, dr, dw, de); if (error) { LIST_FOREACH(d2, &sc->drives, drive) { if (d == d2) break; g_access(d2->consumer, -dr, -dw, -de); } G_VINUM_DEBUG(0, "g_access '%s' failed: %d", d->name, error); return (error); } } return (0); } static void gv_init(struct g_class *mp) { struct g_geom *gp; struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_init(%p)", mp); gp = g_new_geomf(mp, "VINUM"); gp->spoiled = gv_orphan; gp->orphan = gv_orphan; gp->access = gv_access; gp->start = gv_start; gp->softc = g_malloc(sizeof(struct gv_softc), M_WAITOK | M_ZERO); sc = gp->softc; sc->geom = gp; sc->bqueue_down = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); sc->bqueue_up = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(sc->bqueue_down); bioq_init(sc->bqueue_up); LIST_INIT(&sc->drives); LIST_INIT(&sc->subdisks); LIST_INIT(&sc->plexes); LIST_INIT(&sc->volumes); TAILQ_INIT(&sc->equeue); mtx_init(&sc->config_mtx, "gv_config", NULL, MTX_DEF); mtx_init(&sc->equeue_mtx, "gv_equeue", NULL, MTX_DEF); mtx_init(&sc->bqueue_mtx, "gv_bqueue", NULL, MTX_DEF); kproc_create(gv_worker, sc, &sc->worker, 0, 0, "gv_worker"); } static int gv_unload(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_unload(%p)", mp); g_topology_assert(); sc = gp->softc; if (sc != NULL) { gv_worker_exit(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); } return (0); } /* Handle userland request of attaching object. */ static void gv_attach(struct gv_softc *sc, struct gctl_req *req) { struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; off_t *offset; int *rename, type_child, type_parent; char *child, *parent; child = gctl_get_param(req, "child", NULL); if (child == NULL) { gctl_error(req, "no child given"); return; } parent = gctl_get_param(req, "parent", NULL); if (parent == NULL) { gctl_error(req, "no parent given"); return; } offset = gctl_get_paraml(req, "offset", sizeof(*offset)); if (offset == NULL) { gctl_error(req, "no offset given"); return; } rename = gctl_get_paraml(req, "rename", sizeof(*rename)); if (rename == NULL) { gctl_error(req, "no rename flag given"); return; } type_child = gv_object_type(sc, child); type_parent = gv_object_type(sc, parent); switch (type_child) { case GV_TYPE_PLEX: if (type_parent != GV_TYPE_VOL) { gctl_error(req, "no such volume to attach to"); return; } v = gv_find_vol(sc, parent); p = gv_find_plex(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_PLEX, p, v, *offset, *rename); break; case GV_TYPE_SD: if (type_parent != GV_TYPE_PLEX) { gctl_error(req, "no such plex to attach to"); return; } p = gv_find_plex(sc, parent); s = gv_find_sd(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_SD, s, p, *offset, *rename); break; default: gctl_error(req, "invalid child type"); break; } } /* Handle userland request of detaching object. 
*/ static void gv_detach(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; struct gv_sd *s; int *flags, type; char *object; object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no argument given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); type = gv_object_type(sc, object); switch (type) { case GV_TYPE_PLEX: p = gv_find_plex(sc, object); gv_post_event(sc, GV_EVENT_DETACH_PLEX, p, NULL, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); gv_post_event(sc, GV_EVENT_DETACH_SD, s, NULL, *flags, 0); break; default: gctl_error(req, "invalid object type"); break; } } /* Handle userland requests for creating new objects. */ static int gv_create(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d, *d2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_volume *v, *v2; struct g_provider *pp; int error, i, *drives, *flags, *plexes, *subdisks, *volumes; char buf[20]; g_topology_assert(); sc = gp->softc; /* Find out how many of each object have been passed in. */ volumes = gctl_get_paraml(req, "volumes", sizeof(*volumes)); plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (volumes == NULL || plexes == NULL || subdisks == NULL || drives == NULL) { gctl_error(req, "number of objects not given"); return (-1); } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "flags not given"); return (-1); } /* First, handle drive definitions ... */ for (i = 0; i < *drives; i++) { snprintf(buf, sizeof(buf), "drive%d", i); d2 = gctl_get_paraml(req, buf, sizeof(*d2)); if (d2 == NULL) { gctl_error(req, "no drive definition given"); return (-1); } /* * Make sure that the device specified in the drive config is * an active GEOM provider. */ pp = g_provider_by_name(d2->device); if (pp == NULL) { gctl_error(req, "%s: device not found", d2->device); goto error; } if (gv_find_drive(sc, d2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "drive '%s' already exists", d2->name); goto error; } if (gv_find_drive_device(sc, d2->device) != NULL) { gctl_error(req, "device '%s' already configured in " "gvinum", d2->device); goto error; } d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); bcopy(d2, d, sizeof(*d)); gv_post_event(sc, GV_EVENT_CREATE_DRIVE, d, NULL, 0, 0); } /* ... then volume definitions ... */ for (i = 0; i < *volumes; i++) { error = 0; snprintf(buf, sizeof(buf), "volume%d", i); v2 = gctl_get_paraml(req, buf, sizeof(*v2)); if (v2 == NULL) { gctl_error(req, "no volume definition given"); return (-1); } if (gv_find_vol(sc, v2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "volume '%s' already exists", v2->name); goto error; } v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); bcopy(v2, v, sizeof(*v)); gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); } /* ... then plex definitions ... */ for (i = 0; i < *plexes; i++) { error = 0; snprintf(buf, sizeof(buf), "plex%d", i); p2 = gctl_get_paraml(req, buf, sizeof(*p2)); if (p2 == NULL) { gctl_error(req, "no plex definition given"); return (-1); } if (gv_find_plex(sc, p2->name) != NULL) { /* Ignore error. 
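 * (GV_FLAG_F, the force flag, turns "already exists" into a silent skip
 * for drives, volumes, plexes and subdisks alike, so an existing
 * configuration can be re-applied without erroring out.)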
*/ if (*flags & GV_FLAG_F) continue; gctl_error(req, "plex '%s' already exists", p2->name); goto error; } p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); bcopy(p2, p, sizeof(*p)); gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); } /* ... and, finally, subdisk definitions. */ for (i = 0; i < *subdisks; i++) { error = 0; snprintf(buf, sizeof(buf), "sd%d", i); s2 = gctl_get_paraml(req, buf, sizeof(*s2)); if (s2 == NULL) { gctl_error(req, "no subdisk definition given"); return (-1); } if (gv_find_sd(sc, s2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "sd '%s' already exists", s2->name); goto error; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); bcopy(s2, s, sizeof(*s)); gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } error: gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } static void gv_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct gv_softc *sc; struct sbuf *sb; char *comment; g_topology_assert(); gp = LIST_FIRST(&mp->geom); sc = gp->softc; if (!strcmp(verb, "attach")) { gv_attach(sc, req); } else if (!strcmp(verb, "concat")) { gv_concat(gp, req); } else if (!strcmp(verb, "detach")) { gv_detach(sc, req); } else if (!strcmp(verb, "list")) { gv_list(gp, req); /* Save our configuration back to disk. */ } else if (!strcmp(verb, "saveconfig")) { gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Return configuration in string form. */ } else if (!strcmp(verb, "getconfig")) { comment = gctl_get_param(req, "comment", NULL); if (comment == NULL) { gctl_error(req, "no comment parameter given"); return; } sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 0, comment); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } else if (!strcmp(verb, "create")) { gv_create(gp, req); } else if (!strcmp(verb, "mirror")) { gv_mirror(gp, req); } else if (!strcmp(verb, "move")) { gv_move(gp, req); } else if (!strcmp(verb, "raid5")) { gv_raid5(gp, req); } else if (!strcmp(verb, "rebuildparity") || !strcmp(verb, "checkparity")) { gv_parityop(sc, req); } else if (!strcmp(verb, "remove")) { gv_remove(gp, req); } else if (!strcmp(verb, "rename")) { gv_rename(gp, req); } else if (!strcmp(verb, "resetconfig")) { gv_post_event(sc, GV_EVENT_RESET_CONFIG, sc, NULL, 0, 0); } else if (!strcmp(verb, "start")) { gv_start_obj(gp, req); } else if (!strcmp(verb, "stripe")) { gv_stripe(gp, req); } else if (!strcmp(verb, "setstate")) { gv_setstate(gp, req); } else gctl_error(req, "Unknown verb parameter"); } static void gv_parityop(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; int *flags, *rebuild, type; char *plex; plex = gctl_get_param(req, "plex", NULL); if (plex == NULL) { gctl_error(req, "no plex given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); if (rebuild == NULL) { gctl_error(req, "no operation given"); return; } type = gv_object_type(sc, plex); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", plex); return; } p = gv_find_plex(sc, plex); if (p->state != GV_PLEX_UP) { gctl_error(req, "plex %s is not completely accessible", p->name); return; } if (p->org != GV_PLEX_RAID5) { gctl_error(req, "plex %s is not a RAID5 plex", p->name); return; } /* Put it in the event queue. 
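 * Control requests are not executed in the caller's context: they are
 * posted to the event queue and handled asynchronously by the
 * gv_worker() kthread, so the checks above are only a snapshot taken at
 * submission time.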
*/ /* XXX: The state of the plex might have changed when this event is * picked up ... We should perhaps check this afterwards. */ if (*rebuild) gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0); else gv_post_event(sc, GV_EVENT_PARITY_CHECK, p, NULL, 0, 0); } static struct g_geom * gv_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct gv_softc *sc; struct gv_hdr vhdr; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); gp = LIST_FIRST(&mp->geom); if (gp == NULL) { G_VINUM_DEBUG(0, "error: tasting, but not initialized?"); return (NULL); } sc = gp->softc; cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } g_topology_unlock(); error = gv_read_header(cp, &vhdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); /* Check if what we've been given is a valid vinum drive. */ if (!error) gv_post_event(sc, GV_EVENT_DRIVE_TASTED, pp, NULL, 0, 0); return (NULL); } void gv_worker(void *arg) { struct g_provider *pp; struct gv_softc *sc; struct gv_event *ev; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; struct bio *bp; int newstate, flags, err, rename; char *newname; off_t offset; sc = arg; KASSERT(sc != NULL, ("NULL sc")); for (;;) { /* Look at the events first... */ ev = gv_get_event(sc); if (ev != NULL) { gv_remove_event(sc, ev); switch (ev->type) { case GV_EVENT_DRIVE_TASTED: G_VINUM_DEBUG(2, "event 'drive tasted'"); pp = ev->arg1; gv_drive_tasted(sc, pp); break; case GV_EVENT_DRIVE_LOST: G_VINUM_DEBUG(2, "event 'drive lost'"); d = ev->arg1; gv_drive_lost(sc, d); break; case GV_EVENT_CREATE_DRIVE: G_VINUM_DEBUG(2, "event 'create drive'"); d = ev->arg1; gv_create_drive(sc, d); break; case GV_EVENT_CREATE_VOLUME: G_VINUM_DEBUG(2, "event 'create volume'"); v = ev->arg1; gv_create_volume(sc, v); break; case GV_EVENT_CREATE_PLEX: G_VINUM_DEBUG(2, "event 'create plex'"); p = ev->arg1; gv_create_plex(sc, p); break; case GV_EVENT_CREATE_SD: G_VINUM_DEBUG(2, "event 'create sd'"); s = ev->arg1; gv_create_sd(sc, s); break; case GV_EVENT_RM_DRIVE: G_VINUM_DEBUG(2, "event 'remove drive'"); d = ev->arg1; flags = ev->arg3; gv_rm_drive(sc, d, flags); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_VOLUME: G_VINUM_DEBUG(2, "event 'remove volume'"); v = ev->arg1; gv_rm_vol(sc, v); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_PLEX: G_VINUM_DEBUG(2, "event 'remove plex'"); p = ev->arg1; gv_rm_plex(sc, p); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_SD: G_VINUM_DEBUG(2, "event 'remove sd'"); s = ev->arg1; gv_rm_sd(sc, s); /*gv_setup_objects(sc);*/ break; case GV_EVENT_SAVE_CONFIG: G_VINUM_DEBUG(2, "event 'save config'"); gv_save_config(sc); break; case GV_EVENT_SET_SD_STATE: G_VINUM_DEBUG(2, "event 'setstate sd'"); s = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_sd_state(s, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting subdisk" " state: error code %d", err); break; case GV_EVENT_SET_DRIVE_STATE: G_VINUM_DEBUG(2, "event 'setstate drive'"); d = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_drive_state(d, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting drive " "state: error code %d", err); break; case GV_EVENT_SET_VOL_STATE: G_VINUM_DEBUG(2, "event 'setstate volume'"); v = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = 
gv_set_vol_state(v, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting volume " "state: error code %d", err); break; case GV_EVENT_SET_PLEX_STATE: G_VINUM_DEBUG(2, "event 'setstate plex'"); p = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_plex_state(p, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting plex " "state: error code %d", err); break; case GV_EVENT_SETUP_OBJECTS: G_VINUM_DEBUG(2, "event 'setup objects'"); gv_setup_objects(sc); break; case GV_EVENT_RESET_CONFIG: G_VINUM_DEBUG(2, "event 'resetconfig'"); err = gv_resetconfig(sc); if (err) G_VINUM_DEBUG(0, "error resetting " "config: error code %d", err); break; case GV_EVENT_PARITY_REBUILD: /* * Start the rebuild. The gv_plex_done will * handle issuing of the remaining rebuild bio's * until it's finished. */ G_VINUM_DEBUG(2, "event 'rebuild'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; p->flags |= GV_PLEX_REBUILDING; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK | GV_BIO_PARITY, 0); break; case GV_EVENT_PARITY_CHECK: /* Start parity check. */ G_VINUM_DEBUG(2, "event 'check'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK, 0); break; case GV_EVENT_START_PLEX: G_VINUM_DEBUG(2, "event 'start' plex"); p = ev->arg1; gv_start_plex(p); break; case GV_EVENT_START_VOLUME: G_VINUM_DEBUG(2, "event 'start' volume"); v = ev->arg1; gv_start_vol(v); break; case GV_EVENT_ATTACH_PLEX: G_VINUM_DEBUG(2, "event 'attach' plex"); p = ev->arg1; v = ev->arg2; rename = ev->arg4; err = gv_attach_plex(p, v, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", p->name, v->name, err); break; case GV_EVENT_ATTACH_SD: G_VINUM_DEBUG(2, "event 'attach' sd"); s = ev->arg1; p = ev->arg2; offset = ev->arg3; rename = ev->arg4; err = gv_attach_sd(s, p, offset, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", s->name, p->name, err); break; case GV_EVENT_DETACH_PLEX: G_VINUM_DEBUG(2, "event 'detach' plex"); p = ev->arg1; flags = ev->arg3; err = gv_detach_plex(p, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", p->name, err); break; case GV_EVENT_DETACH_SD: G_VINUM_DEBUG(2, "event 'detach' sd"); s = ev->arg1; flags = ev->arg3; err = gv_detach_sd(s, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", s->name, err); break; case GV_EVENT_RENAME_VOL: G_VINUM_DEBUG(2, "event 'rename' volume"); v = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_vol(sc, v, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", v->name, newname, err); g_free(newname); /* Destroy and 
recreate the provider if we can. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(0, "unable to rename " "provider to %s: provider in use", v->name); break; } g_topology_lock(); g_wither_provider(v->provider, ENOENT); g_topology_unlock(); v->provider = NULL; gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); break; case GV_EVENT_RENAME_PLEX: G_VINUM_DEBUG(2, "event 'rename' plex"); p = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_plex(sc, p, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", p->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_SD: G_VINUM_DEBUG(2, "event 'rename' sd"); s = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_sd(sc, s, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", s->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_DRIVE: G_VINUM_DEBUG(2, "event 'rename' drive"); d = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_drive(sc, d, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", d->name, newname, err); g_free(newname); break; case GV_EVENT_MOVE_SD: G_VINUM_DEBUG(2, "event 'move' sd"); s = ev->arg1; d = ev->arg2; flags = ev->arg3; err = gv_move_sd(sc, s, d, flags); if (err) G_VINUM_DEBUG(0, "error moving %s to " "%s: error code %d", s->name, d->name, err); break; case GV_EVENT_THREAD_EXIT: G_VINUM_DEBUG(2, "event 'thread exit'"); g_free(ev); mtx_lock(&sc->equeue_mtx); mtx_lock(&sc->bqueue_mtx); gv_cleanup(sc); mtx_destroy(&sc->bqueue_mtx); mtx_destroy(&sc->equeue_mtx); g_free(sc->bqueue_down); g_free(sc->bqueue_up); g_free(sc); kproc_exit(0); /* NOTREACHED */ default: G_VINUM_DEBUG(1, "unknown event %d", ev->type); } g_free(ev); continue; } /* ... then do I/O processing. */ mtx_lock(&sc->bqueue_mtx); /* First do new requests. */ bp = bioq_takefirst(sc->bqueue_down); if (bp != NULL) { mtx_unlock(&sc->bqueue_mtx); /* A bio that interfered with another bio. */ if (bp->bio_pflags & GV_BIO_ONHOLD) { s = bp->bio_caller1; p = s->plex_sc; /* Is it still locked out? */ if (gv_stripe_active(p, bp)) { /* Park the bio on the waiting queue. */ bioq_disksort(p->wqueue, bp); } else { bp->bio_pflags &= ~GV_BIO_ONHOLD; g_io_request(bp, s->drive_sc->consumer); } /* A special request requireing special handling. */ } else if (bp->bio_pflags & GV_BIO_INTERNAL) { p = bp->bio_caller1; gv_plex_start(p, bp); } else { gv_volume_start(sc, bp); } mtx_lock(&sc->bqueue_mtx); } /* Then do completed requests. */ bp = bioq_takefirst(sc->bqueue_up); if (bp == NULL) { msleep(sc, &sc->bqueue_mtx, PRIBIO, "-", hz/10); mtx_unlock(&sc->bqueue_mtx); continue; } mtx_unlock(&sc->bqueue_mtx); gv_bio_done(sc, bp); } } #define VINUM_CLASS_NAME "VINUM" static struct g_class g_vinum_class = { .name = VINUM_CLASS_NAME, .version = G_VERSION, .init = gv_init, .taste = gv_taste, .ctlreq = gv_config, .destroy_geom = gv_unload, }; DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); MODULE_VERSION(geom_vinum, 0); Index: head/sys/geom/vinum/geom_vinum.h =================================================================== --- head/sys/geom/vinum/geom_vinum.h (revision 350693) +++ head/sys/geom/vinum/geom_vinum.h (revision 350694) @@ -1,184 +1,165 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_H_ #define _GEOM_VINUM_H_ /* geom_vinum_create.c */ void gv_concat(struct g_geom *gp, struct gctl_req *); void gv_mirror(struct g_geom *gp, struct gctl_req *); void gv_stripe(struct g_geom *gp, struct gctl_req *); void gv_raid5(struct g_geom *gp, struct gctl_req *); int gv_create_drive(struct gv_softc *, struct gv_drive *); int gv_create_volume(struct gv_softc *, struct gv_volume *); int gv_create_plex(struct gv_softc *, struct gv_plex *); int gv_create_sd(struct gv_softc *, struct gv_sd *); /* geom_vinum_drive.c */ void gv_save_config(struct gv_softc *); int gv_read_header(struct g_consumer *, struct gv_hdr *); int gv_write_header(struct g_consumer *, struct gv_hdr *); /* geom_vinum_init.c */ void gv_start_obj(struct g_geom *, struct gctl_req *); int gv_start_plex(struct gv_plex *); int gv_start_vol(struct gv_volume *); /* geom_vinum_list.c */ void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lp(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_ls(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lv(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_list(struct g_geom *, struct gctl_req *); /* geom_vinum_move.c */ void gv_move(struct g_geom *, struct gctl_req *); int gv_move_sd(struct gv_softc *, struct gv_sd *, struct gv_drive *, int); /* geom_vinum_rename.c */ void gv_rename(struct g_geom *, struct gctl_req *); int gv_rename_drive(struct gv_softc *, struct gv_drive *, char *, int); int gv_rename_plex(struct gv_softc *, struct gv_plex *, char *, int); int gv_rename_sd(struct gv_softc *, struct gv_sd *, char *, int); int gv_rename_vol(struct gv_softc *, struct gv_volume *, char *, int); /* geom_vinum_rm.c */ void gv_remove(struct g_geom *, struct gctl_req *); int gv_resetconfig(struct gv_softc *); void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s); void gv_rm_drive(struct gv_softc *, struct gv_drive *, int); void gv_rm_plex(struct gv_softc *, struct gv_plex *); void gv_rm_vol(struct gv_softc *, struct gv_volume *); /* geom_vinum_state.c */ int gv_sdstatemap(struct gv_plex *); void gv_setstate(struct g_geom *, struct gctl_req *); int gv_set_drive_state(struct gv_drive *, int, int); int gv_set_sd_state(struct gv_sd *, int, int); int gv_set_vol_state(struct gv_volume *, int, int); int gv_set_plex_state(struct gv_plex *, int, int); void gv_update_sd_state(struct gv_sd *); void gv_update_plex_state(struct gv_plex *); void 
gv_update_vol_state(struct gv_volume *); /* geom_vinum_subr.c */ void gv_adjust_freespace(struct gv_sd *, off_t); void gv_free_sd(struct gv_sd *); struct gv_drive *gv_find_drive(struct gv_softc *, char *); struct gv_drive *gv_find_drive_device(struct gv_softc *, char *); struct gv_plex *gv_find_plex(struct gv_softc *, char *); struct gv_sd *gv_find_sd(struct gv_softc *, char *); struct gv_volume *gv_find_vol(struct gv_softc *, char *); void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); int gv_is_striped(struct gv_plex *); int gv_consumer_is_open(struct g_consumer *); int gv_provider_is_open(struct g_provider *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, char *, struct gv_drive *); int gv_sd_to_drive(struct gv_sd *, struct gv_drive *); int gv_sd_to_plex(struct gv_sd *, struct gv_plex *); int gv_sdcount(struct gv_plex *, int); void gv_update_plex_config(struct gv_plex *); void gv_update_vol_size(struct gv_volume *, off_t); off_t gv_vol_size(struct gv_volume *); off_t gv_plex_size(struct gv_plex *); int gv_plexdown(struct gv_volume *); int gv_attach_plex(struct gv_plex *, struct gv_volume *, int); int gv_attach_sd(struct gv_sd *, struct gv_plex *, off_t, int); int gv_detach_plex(struct gv_plex *, int); int gv_detach_sd(struct gv_sd *, int); /* geom_vinum.c */ void gv_worker(void *); void gv_post_event(struct gv_softc *, int, void *, void *, intmax_t, intmax_t); void gv_worker_exit(struct gv_softc *); struct gv_event *gv_get_event(struct gv_softc *); void gv_remove_event(struct gv_softc *, struct gv_event *); void gv_drive_tasted(struct gv_softc *, struct g_provider *); void gv_drive_lost(struct gv_softc *, struct gv_drive *); void gv_setup_objects(struct gv_softc *); void gv_start(struct bio *); int gv_access(struct g_provider *, int, int, int); void gv_cleanup(struct gv_softc *); /* geom_vinum_volume.c */ void gv_done(struct bio *); void gv_volume_start(struct gv_softc *, struct bio *); void gv_volume_flush(struct gv_volume *); void gv_bio_done(struct gv_softc *, struct bio *); /* geom_vinum_plex.c */ void gv_plex_start(struct gv_plex *, struct bio *); void gv_plex_raid5_done(struct gv_plex *, struct bio *); void gv_plex_normal_done(struct gv_plex *, struct bio *); int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); void gv_grow_complete(struct gv_plex *, struct bio *); void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); void gv_init_complete(struct gv_plex *, struct bio *); void gv_parity_request(struct gv_plex *, int, off_t); void gv_parity_complete(struct gv_plex *, struct bio *); void gv_rebuild_complete(struct gv_plex *, struct bio *); int gv_sync_request(struct gv_plex *, struct gv_plex *, off_t, off_t, int, caddr_t); int gv_sync_complete(struct gv_plex *, struct bio *); extern u_int g_vinum_debug; -#define G_VINUM_DEBUG(lvl, ...) do { \ - if (g_vinum_debug >= (lvl)) { \ - printf("GEOM_VINUM"); \ - if (g_vinum_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) - -#define G_VINUM_LOGREQ(lvl, bp, ...) do { \ - if (g_vinum_debug >= (lvl)) { \ - printf("GEOM_VINUM"); \ - if (g_vinum_debug > 0) \ - printf("[%u]", lvl); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define G_VINUM_DEBUG(lvl, ...) \ + _GEOM_DEBUG("GEOM_VINUM", g_vinum_debug, (lvl), NULL, __VA_ARGS__) +#define G_VINUM_LOGREQ(lvl, bp, ...) 
\ + _GEOM_DEBUG("GEOM_VINUM", g_vinum_debug, (lvl), (bp), __VA_ARGS__) #endif /* !_GEOM_VINUM_H_ */ Index: head/sys/geom/vinum/geom_vinum_create.c =================================================================== --- head/sys/geom/vinum/geom_vinum_create.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_create.c (revision 350694) @@ -1,612 +1,613 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include #include #include #define DEFAULT_STRIPESIZE 262144 /* * Create a new drive object, either by user request, during taste of the drive * itself, or because it was referenced by a subdisk during taste. */ int gv_create_drive(struct gv_softc *sc, struct gv_drive *d) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; struct gv_drive *d2; struct gv_hdr *hdr; struct gv_freelist *fl; KASSERT(d != NULL, ("gv_create_drive: NULL d")); gp = sc->geom; pp = NULL; cp = cp2 = NULL; /* The drive already has a consumer if it was tasted before. */ if (d->consumer != NULL) { cp = d->consumer; cp->private = d; pp = cp->provider; } else if (!(d->flags & GV_DRIVE_REFERENCED)) { if (gv_find_drive(sc, d->name) != NULL) { G_VINUM_DEBUG(0, "drive '%s' already exists", d->name); g_free(d); return (GV_ERR_CREATE); } if (gv_find_drive_device(sc, d->device) != NULL) { G_VINUM_DEBUG(0, "provider '%s' already in use by " "gvinum", d->device); return (GV_ERR_CREATE); } pp = g_provider_by_name(d->device); if (pp == NULL) { G_VINUM_DEBUG(0, "create '%s': device '%s' disappeared", d->name, d->device); g_free(d); return (GV_ERR_CREATE); } g_topology_lock(); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to attach", d->name); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); d->consumer = cp; cp->private = d; } /* * If this was just a "referenced" drive, we're almost finished, but * insert this drive not on the head of the drives list, as * gv_drive_is_newer() expects a "real" drive from LIST_FIRST(). 
*/ if (d->flags & GV_DRIVE_REFERENCED) { snprintf(d->device, sizeof(d->device), "???"); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); return (0); } /* * Update access counts of the new drive to those of an already * existing drive. */ LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->consumer == NULL)) continue; cp2 = d2->consumer; g_topology_lock(); if ((cp2->acr || cp2->acw || cp2->ace) && (g_access(cp, cp2->acr, cp2->acw, cp2->ace) != 0)) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to update " "access counts", d->name); if (d->hdr != NULL) g_free(d->hdr); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); break; } d->size = pp->mediasize - GV_DATA_START; d->avail = d->size; d->vinumconf = sc; LIST_INIT(&d->subdisks); LIST_INIT(&d->freelist); /* The header might have been set during taste. */ if (d->hdr == NULL) { hdr = g_malloc(sizeof(*hdr), M_WAITOK | M_ZERO); hdr->magic = GV_MAGIC; hdr->config_length = GV_CFG_LEN; getcredhostname(NULL, hdr->label.sysname, GV_HOSTNAME_LEN); strlcpy(hdr->label.name, d->name, sizeof(hdr->label.name)); microtime(&hdr->label.date_of_birth); d->hdr = hdr; } /* We also need a freelist entry. */ fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); fl->offset = GV_DATA_START; fl->size = d->avail; LIST_INSERT_HEAD(&d->freelist, fl, freelist); d->freelist_entries = 1; if (gv_find_drive(sc, d->name) == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); gv_set_drive_state(d, GV_DRIVE_UP, 0); return (0); } int gv_create_volume(struct gv_softc *sc, struct gv_volume *v) { KASSERT(v != NULL, ("gv_create_volume: NULL v")); v->vinumconf = sc; v->flags |= GV_VOL_NEWBORN; LIST_INIT(&v->plexes); LIST_INSERT_HEAD(&sc->volumes, v, volume); v->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(v->wqueue); return (0); } int gv_create_plex(struct gv_softc *sc, struct gv_plex *p) { struct gv_volume *v; KASSERT(p != NULL, ("gv_create_plex: NULL p")); /* Find the volume this plex should be attached to. */ v = gv_find_vol(sc, p->volume); if (v == NULL) { G_VINUM_DEBUG(0, "create plex '%s': volume '%s' not found", p->name, p->volume); g_free(p); return (GV_ERR_CREATE); } if (!(v->flags & GV_VOL_NEWBORN)) p->flags |= GV_PLEX_ADDED; p->vol_sc = v; v->plexcount++; p->vinumconf = sc; p->synced = 0; p->flags |= GV_PLEX_NEWBORN; LIST_INSERT_HEAD(&v->plexes, p, in_volume); LIST_INIT(&p->subdisks); TAILQ_INIT(&p->packets); LIST_INSERT_HEAD(&sc->plexes, p, plex); p->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->bqueue); p->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->wqueue); p->rqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->rqueue); return (0); } int gv_create_sd(struct gv_softc *sc, struct gv_sd *s) { struct gv_plex *p; struct gv_drive *d; KASSERT(s != NULL, ("gv_create_sd: NULL s")); /* Find the drive where this subdisk should be put on. */ d = gv_find_drive(sc, s->drive); if (d == NULL) { /* * It's possible that the subdisk references a drive that * doesn't exist yet (during the taste process), so create a * practically empty "referenced" drive. 
*/ if (s->flags & GV_SD_TASTED) { d = g_malloc(sizeof(struct gv_drive), M_WAITOK | M_ZERO); d->flags |= GV_DRIVE_REFERENCED; strlcpy(d->name, s->drive, sizeof(d->name)); gv_create_drive(sc, d); } else { G_VINUM_DEBUG(0, "create sd '%s': drive '%s' not found", s->name, s->drive); g_free(s); return (GV_ERR_CREATE); } } /* Find the plex where this subdisk belongs to. */ p = gv_find_plex(sc, s->plex); if (p == NULL) { G_VINUM_DEBUG(0, "create sd '%s': plex '%s' not found", s->name, s->plex); g_free(s); return (GV_ERR_CREATE); } /* * First we give the subdisk to the drive, to handle autosized * values ... */ if (gv_sd_to_drive(s, d) != 0) { g_free(s); return (GV_ERR_CREATE); } /* * Then, we give the subdisk to the plex; we check if the * given values are correct and maybe adjust them. */ if (gv_sd_to_plex(s, p) != 0) { G_VINUM_DEBUG(0, "unable to give sd '%s' to plex '%s'", s->name, p->name); if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); gv_free_sd(s); g_free(s); /* * If this subdisk can't be created, we won't create * the attached plex either, if it is also a new one. */ if (!(p->flags & GV_PLEX_NEWBORN)) return (GV_ERR_CREATE); gv_rm_plex(sc, p); return (GV_ERR_CREATE); } s->flags |= GV_SD_NEWBORN; s->vinumconf = sc; LIST_INSERT_HEAD(&sc->subdisks, s, sd); return (0); } /* * Create a concatenated volume from specified drives or drivegroups. */ void gv_concat(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, dcount; sc = gp->softc; dcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_CONCAT; p->stripesize = 0; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Drives are first (right now) priority */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a mirrored volume from specified drives or drivegroups. 
*/ void gv_mirror(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, *flags, dcount, pcount, scount; sc = gp->softc; dcount = 0; scount = 0; pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have an even number of drives. */ if (*drives % 2 != 0) { gctl_error(req, "mirror organization must have an even number " "of drives"); return; } if (*flags & GV_FLAG_S && *drives < 4) { gctl_error(req, "must have at least 4 drives for striped plex"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plexes. */ for (pcount = 0; pcount < 2; pcount++) { p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, pcount); strlcpy(p->volume, v->name, sizeof(p->volume)); if (*flags & GV_FLAG_S) { p->org = GV_PLEX_STRIPED; p->stripesize = DEFAULT_STRIPESIZE; } else { p->org = GV_PLEX_CONCAT; p->stripesize = -1; } gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* * We just gives each even drive to plex one, and each odd to * plex two. */ scount = 0; for (dcount = pcount; dcount < *drives; dcount += 2) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s', aborting", drive); scount++; break; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, scount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); scount++; } } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } void gv_raid5(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; int *drives, *flags, dcount; char *vol, *drive, buf[30]; off_t *stripesize; sc = gp->softc; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "no stripesize given"); return; } if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least three drives. */ if (*drives < 3) { gctl_error(req, "must have at least three drives for this " "plex organisation"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. 
*/ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_RAID5; p->stripesize = *stripesize; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a striped volume from specified drives or drivegroups. */ void gv_stripe(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, *flags, dcount, pcount; sc = gp->softc; dcount = 0; pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least two drives. */ if (*drives < 2) { gctl_error(req, "must have at least 2 drives"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_STRIPED; p->stripesize = 262144; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } Index: head/sys/geom/vinum/geom_vinum_drive.c =================================================================== --- head/sys/geom/vinum/geom_vinum_drive.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_drive.c (revision 350694) @@ -1,354 +1,355 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #define GV_LEGACY_I386 0 #define GV_LEGACY_AMD64 1 #define GV_LEGACY_SPARC64 2 #define GV_LEGACY_POWERPC 3 static int gv_legacy_header_type(uint8_t *, int); /* * Here are the "offset (size)" for the various struct gv_hdr fields, * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and * current (cpu & endian agnostic) versions of the on-disk format of the vinum * header structure: * * i386 amd64 current field * -------- -------- -------- ----- * 0 ( 8) 0 ( 8) 0 ( 8) magic * 8 ( 4) 8 ( 8) 8 ( 8) config_length * 12 (32) 16 (32) 16 (32) label.sysname * 44 (32) 48 (32) 48 (32) label.name * 76 ( 4) 80 ( 8) 80 ( 8) label.date_of_birth.tv_sec * 80 ( 4) 88 ( 8) 88 ( 8) label.date_of_birth.tv_usec * 84 ( 4) 96 ( 8) 96 ( 8) label.last_update.tv_sec * 88 ( 4) 104 ( 8) 104 ( 8) label.last_update.tv_usec * 92 ( 8) 112 ( 8) 112 ( 8) label.drive_size * ======== ======== ======== * 100 120 120 total size * * NOTE: i386 and amd64 formats are stored as little-endian; the current * format uses big-endian (network order). */ /* Checks for legacy format depending on platform. */ static int gv_legacy_header_type(uint8_t *hdr, int bigendian) { uint32_t *i32; int arch_32, arch_64, i; /* Set arch according to endianness. */ if (bigendian) { arch_32 = GV_LEGACY_POWERPC; arch_64 = GV_LEGACY_SPARC64; } else { arch_32 = GV_LEGACY_I386; arch_64 = GV_LEGACY_AMD64; } /* if non-empty hostname overlaps 64-bit config_length */ i32 = (uint32_t *)(hdr + 12); if (*i32 != 0) return (arch_32); /* check for non-empty hostname */ if (hdr[16] != 0) return (arch_64); /* check bytes past 32-bit structure */ for (i = 100; i < 120; i++) if (hdr[i] != 0) return (arch_32); /* check for overlapping timestamp */ i32 = (uint32_t *)(hdr + 84); if (*i32 == 0) return (arch_64); return (arch_32); } /* * Read the header while taking magic number into account, and write it to * destination pointer. 
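gv_legacy_header_type() above distinguishes the 32-bit and 64-bit legacy on-disk layouts purely from which bytes of the raw header can be non-zero, following the offset table in the comment. A stand-alone restatement of that heuristic is sketched below; the enum and function names are invented, and the real function additionally maps the result to a per-platform constant depending on endianness.

/*
 * Classify a 120-byte raw legacy header as the 32-bit or 64-bit layout by
 * probing bytes that are only meaningful in one of the two formats.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum legacy_layout { LEGACY_32BIT, LEGACY_64BIT };

static enum legacy_layout
classify_legacy_header(const uint8_t *hdr)
{
	uint32_t v;
	int i;

	/* Non-empty hostname at offset 12 overlaps 64-bit config_length. */
	memcpy(&v, hdr + 12, sizeof(v));
	if (v != 0)
		return (LEGACY_32BIT);
	/* The 64-bit layout keeps the hostname at offset 16 instead. */
	if (hdr[16] != 0)
		return (LEGACY_64BIT);
	/* Probe the bytes past the 100-byte 32-bit structure. */
	for (i = 100; i < 120; i++)
		if (hdr[i] != 0)
			return (LEGACY_32BIT);
	/* Finally, check the overlapping timestamp word at offset 84. */
	memcpy(&v, hdr + 84, sizeof(v));
	return (v == 0 ? LEGACY_64BIT : LEGACY_32BIT);
}

int
main(void)
{
	uint8_t hdr[120];

	memset(hdr, 0, sizeof(hdr));
	hdr[16] = 'h';	/* hostname where only the 64-bit layout puts it */
	printf("layout: %s\n",
	    classify_legacy_header(hdr) == LEGACY_64BIT ? "64-bit" : "32-bit");
	return (0);
}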
*/ int gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { struct g_provider *pp; uint64_t magic_machdep; uint8_t *d_hdr; int be, off; #define GV_GET32(endian) \ endian##32toh(*((uint32_t *)&d_hdr[off])); \ off += 4 #define GV_GET64(endian) \ endian##64toh(*((uint64_t *)&d_hdr[off])); \ off += 8 KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr")); KASSERT(cp != NULL, ("gv_read_header: null cp")); pp = cp->provider; KASSERT(pp != NULL, ("gv_read_header: null pp")); if ((GV_HDR_OFFSET % pp->sectorsize) != 0 || (GV_HDR_LEN % pp->sectorsize) != 0) return (ENODEV); d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); if (d_hdr == NULL) return (-1); off = 0; m_hdr->magic = GV_GET64(be); magic_machdep = *((uint64_t *)&d_hdr[0]); /* * The big endian machines will have a reverse of GV_OLD_MAGIC, so we * need to decide if we are running on a big endian machine as well as * checking the magic against the reverse of GV_OLD_MAGIC. */ be = (m_hdr->magic == magic_machdep); if (m_hdr->magic == GV_MAGIC) { m_hdr->config_length = GV_GET64(be); off = 16; bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (m_hdr->magic != GV_OLD_MAGIC && m_hdr->magic != le64toh(GV_OLD_MAGIC)) { /* Not a gvinum drive. */ g_free(d_hdr); return (-1); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) { G_VINUM_DEBUG(1, "detected legacy sparc64 header"); m_hdr->magic = GV_MAGIC; /* Legacy sparc64 on-disk header */ m_hdr->config_length = GV_GET64(be); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) { G_VINUM_DEBUG(1, "detected legacy PowerPC header"); m_hdr->magic = GV_MAGIC; /* legacy 32-bit big endian on-disk header */ m_hdr->config_length = GV_GET32(be); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(be); m_hdr->label.date_of_birth.tv_usec = GV_GET32(be); m_hdr->label.last_update.tv_sec = GV_GET32(be); m_hdr->label.last_update.tv_usec = GV_GET32(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) { G_VINUM_DEBUG(1, "detected legacy i386 header"); m_hdr->magic = GV_MAGIC; /* legacy i386 on-disk header */ m_hdr->config_length = GV_GET32(le); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(le); m_hdr->label.date_of_birth.tv_usec = GV_GET32(le); m_hdr->label.last_update.tv_sec = GV_GET32(le); m_hdr->label.last_update.tv_usec = GV_GET32(le); m_hdr->label.drive_size = GV_GET64(le); } else { G_VINUM_DEBUG(1, "detected legacy amd64 header"); m_hdr->magic = GV_MAGIC; 
/* legacy amd64 on-disk header */ m_hdr->config_length = GV_GET64(le); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(le); m_hdr->label.date_of_birth.tv_usec = GV_GET64(le); m_hdr->label.last_update.tv_sec = GV_GET64(le); m_hdr->label.last_update.tv_usec = GV_GET64(le); m_hdr->label.drive_size = GV_GET64(le); } g_free(d_hdr); return (0); } /* Write out the gvinum header. */ int gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { uint8_t d_hdr[GV_HDR_LEN]; int off, ret; #define GV_SET64BE(field) \ do { \ *((uint64_t *)&d_hdr[off]) = htobe64(field); \ off += 8; \ } while (0) KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr")); off = 0; memset(d_hdr, 0, GV_HDR_LEN); GV_SET64BE(m_hdr->magic); GV_SET64BE(m_hdr->config_length); off = 16; bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; GV_SET64BE(m_hdr->label.date_of_birth.tv_sec); GV_SET64BE(m_hdr->label.date_of_birth.tv_usec); GV_SET64BE(m_hdr->label.last_update.tv_sec); GV_SET64BE(m_hdr->label.last_update.tv_usec); GV_SET64BE(m_hdr->label.drive_size); ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN); return (ret); } /* Save the vinum configuration back to each involved disk. */ void gv_save_config(struct gv_softc *sc) { struct g_consumer *cp; struct gv_drive *d; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; struct timeval last_update; int error; KASSERT(sc != NULL, ("gv_save_config: null sc")); vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; microtime(&last_update); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); LIST_FOREACH(d, &sc->drives, drive) { /* * We can't save the config on a drive that isn't up, but * drives that were just created aren't officially up yet, so * we check a special flag. */ if (d->state != GV_DRIVE_UP) continue; cp = d->consumer; if (cp == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no consumer!", d->name); continue; } hdr = d->hdr; if (hdr == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no header", d->name); g_free(vhdr); continue; } bcopy(&last_update, &hdr->label.last_update, sizeof(struct timeval)); bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); g_topology_lock(); error = g_access(cp, 0, 1, 0); if (error) { G_VINUM_DEBUG(0, "g_access failed on " "drive %s, errno %d", d->name, error); g_topology_unlock(); continue; } g_topology_unlock(); error = gv_write_header(cp, vhdr); if (error) { G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, " "errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* First config copy. */ error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb), GV_CFG_LEN); if (error) { G_VINUM_DEBUG(0, "writing first config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* Second config copy. 
*/ error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN, sbuf_data(sb), GV_CFG_LEN); if (error) G_VINUM_DEBUG(0, "writing second config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); } sbuf_delete(sb); g_free(vhdr); } Index: head/sys/geom/vinum/geom_vinum_events.c =================================================================== --- head/sys/geom/vinum/geom_vinum_events.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_events.c (revision 350694) @@ -1,262 +1,263 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
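gv_save_config() above formats the configuration once and then, per drive, writes the header followed by two identical copies of that text at fixed offsets. A user-space sketch of that on-media layout follows; the offsets and sizes are invented for the example, and the real values come from the vinum constants (GV_HDR_OFFSET, GV_CFG_OFFSET, GV_CFG_LEN).

/*
 * Write a header block and two redundant, fixed-size copies of a textual
 * configuration to a file at fixed offsets.
 */
#include <stdio.h>
#include <string.h>

#define HDR_OFFSET	512L	/* invented; stands in for GV_HDR_OFFSET */
#define CFG_OFFSET	4096L	/* invented; stands in for GV_CFG_OFFSET */
#define CFG_LEN		2048	/* invented; stands in for GV_CFG_LEN */

static int
save_config(FILE *fp, const char *hdr, const char *cfg)
{
	char block[CFG_LEN];

	/* Header first. */
	if (fseek(fp, HDR_OFFSET, SEEK_SET) != 0 ||
	    fwrite(hdr, 1, strlen(hdr), fp) != strlen(hdr))
		return (-1);

	/* Pad the config to a fixed-size block, then write it twice. */
	memset(block, 0, sizeof(block));
	strncpy(block, cfg, sizeof(block) - 1);
	if (fseek(fp, CFG_OFFSET, SEEK_SET) != 0 ||
	    fwrite(block, 1, sizeof(block), fp) != sizeof(block))
		return (-1);
	if (fseek(fp, CFG_OFFSET + CFG_LEN, SEEK_SET) != 0 ||
	    fwrite(block, 1, sizeof(block), fp) != sizeof(block))
		return (-1);
	return (0);
}

int
main(void)
{
	FILE *fp = fopen("drive.img", "w+b");

	if (fp == NULL)
		return (1);
	if (save_config(fp, "vinum-header",
	    "volume myvol\nplex name myvol.p0\n") != 0)
		fprintf(stderr, "write failed\n");
	fclose(fp);
	return (0);
}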
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include void gv_post_event(struct gv_softc *sc, int event, void *arg1, void *arg2, intmax_t arg3, intmax_t arg4) { struct gv_event *ev; ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO); ev->type = event; ev->arg1 = arg1; ev->arg2 = arg2; ev->arg3 = arg3; ev->arg4 = arg4; mtx_lock(&sc->equeue_mtx); TAILQ_INSERT_TAIL(&sc->equeue, ev, events); wakeup(sc); mtx_unlock(&sc->equeue_mtx); } void gv_worker_exit(struct gv_softc *sc) { struct gv_event *ev; ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO); ev->type = GV_EVENT_THREAD_EXIT; mtx_lock(&sc->equeue_mtx); TAILQ_INSERT_TAIL(&sc->equeue, ev, events); wakeup(sc); msleep(sc->worker, &sc->equeue_mtx, PDROP, "gv_wor", 0); } struct gv_event * gv_get_event(struct gv_softc *sc) { struct gv_event *ev; KASSERT(sc != NULL, ("NULL sc")); mtx_lock(&sc->equeue_mtx); ev = TAILQ_FIRST(&sc->equeue); mtx_unlock(&sc->equeue_mtx); return (ev); } void gv_remove_event(struct gv_softc *sc, struct gv_event *ev) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(ev != NULL, ("NULL ev")); mtx_lock(&sc->equeue_mtx); TAILQ_REMOVE(&sc->equeue, ev, events); mtx_unlock(&sc->equeue_mtx); } void gv_drive_tasted(struct gv_softc *sc, struct g_provider *pp) { struct g_geom *gp; struct g_consumer *cp; struct gv_hdr *hdr; struct gv_drive *d; char *buf; int error; hdr = NULL; buf = NULL; G_VINUM_DEBUG(2, "tasted drive on '%s'", pp->name); if ((GV_CFG_OFFSET % pp->sectorsize) != 0 || (GV_CFG_LEN % pp->sectorsize) != 0) { G_VINUM_DEBUG(0, "provider %s has unsupported sectorsize.", pp->name); return; } gp = sc->geom; g_topology_lock(); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "failed to attach to provider on taste event"); return; } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "failed to access consumer on taste event"); return; } g_topology_unlock(); hdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); /* Read header and on-disk configuration. */ error = gv_read_header(cp, hdr); if (error) { G_VINUM_DEBUG(0, "failed to read header during taste"); goto failed; } /* * Setup the drive before we parse the on-disk configuration, so that * we already know about the drive then. */ d = gv_find_drive(sc, hdr->label.name); if (d == NULL) { d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); strlcpy(d->name, hdr->label.name, sizeof(d->name)); strlcpy(d->device, pp->name, sizeof(d->device)); } else if (d->flags & GV_DRIVE_REFERENCED) { strlcpy(d->device, pp->name, sizeof(d->device)); d->flags &= ~GV_DRIVE_REFERENCED; } else { G_VINUM_DEBUG(2, "drive '%s' is already known", d->name); goto failed; } /* Add the consumer and header to the new drive. */ d->consumer = cp; d->hdr = hdr; gv_create_drive(sc, d); buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL); if (buf == NULL) { G_VINUM_DEBUG(0, "failed to read config during taste"); goto failed; } gv_parse_config(sc, buf, d); g_free(buf); g_topology_lock(); g_access(cp, -1, 0, 0); g_topology_unlock(); gv_setup_objects(sc); gv_set_drive_state(d, GV_DRIVE_UP, 0); return; failed: if (hdr != NULL) g_free(hdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } /* * When losing a drive (e.g. 
hardware failure), we cut down the consumer * attached to the underlying device and bring the drive itself to a * "referenced" state so that normal tasting could bring it up cleanly if it * possibly arrives again. */ void gv_drive_lost(struct gv_softc *sc, struct gv_drive *d) { struct g_consumer *cp; struct gv_drive *d2; struct gv_sd *s, *s2; struct gv_freelist *fl, *fl2; gv_set_drive_state(d, GV_DRIVE_DOWN, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); cp = d->consumer; if (cp != NULL) { if (cp->nstart != cp->nend) { G_VINUM_DEBUG(0, "dead drive '%s' has still active " "requests, unable to detach consumer", d->name); gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); return; } g_topology_lock(); if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } d->consumer = NULL; g_free(d->hdr); d->hdr = NULL; d->flags |= GV_DRIVE_REFERENCED; snprintf(d->device, sizeof(d->device), "???"); d->size = 0; d->avail = 0; d->freelist_entries = 0; d->sdcount = 0; /* Put the subdisk in tasted mode, and remove from drive list. */ LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { LIST_REMOVE(s, from_drive); s->flags |= GV_SD_TASTED; } /* * Don't forget that gv_is_newer wants a "real" drive at the beginning * of the list, so, just to be safe, we shuffle around. */ LIST_REMOVE(d, drive); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); gv_save_config(sc); } Index: head/sys/geom/vinum/geom_vinum_init.c =================================================================== --- head/sys/geom/vinum/geom_vinum_init.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_init.c (revision 350694) @@ -1,390 +1,391 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include static int gv_sync(struct gv_volume *); static int gv_rebuild_plex(struct gv_plex *); static int gv_init_plex(struct gv_plex *); static int gv_grow_plex(struct gv_plex *); static int gv_sync_plex(struct gv_plex *, struct gv_plex *); static struct gv_plex *gv_find_good_plex(struct gv_volume *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; int *argc, *initsize; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } sc = gp->softc; for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); if (v != NULL) gv_post_event(sc, GV_EVENT_START_VOLUME, v, NULL, *initsize, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); if (p != NULL) gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL, *initsize, 0); break; case GV_TYPE_SD: case GV_TYPE_DRIVE: /* XXX Not implemented, but what is the use? */ gctl_error(req, "unable to start '%s' - not yet supported", argv); return; default: gctl_error(req, "unknown object '%s'", argv); return; } } } int gv_start_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_plex *up; struct gv_sd *s; int error; KASSERT(p != NULL, ("gv_start_plex: NULL p")); error = 0; v = p->vol_sc; /* RAID5 plexes can either be init, rebuilt or grown. */ if (p->org == GV_PLEX_RAID5) { if (p->state > GV_PLEX_DEGRADED) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); return (error); } } } else if (p->state == GV_PLEX_DEGRADED) { error = gv_rebuild_plex(p); } else error = gv_init_plex(p); } else { /* We want to sync from the other plex if we're down. */ if (p->state == GV_PLEX_DOWN && v->plexcount > 1) { up = gv_find_good_plex(v); if (up == NULL) { G_VINUM_DEBUG(1, "unable to find a good plex"); return (ENXIO); } g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to " "access volume: %d", up->name, error); return (error); } g_topology_unlock(); error = gv_sync_plex(p, up); if (error) return (error); /* * In case we have a stripe that is up, check whether it can be * grown. */ } else if (p->org == GV_PLEX_STRIPED && p->state != GV_PLEX_DOWN) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); break; } } } } return (error); } int gv_start_vol(struct gv_volume *v) { struct gv_plex *p; int error; KASSERT(v != NULL, ("gv_start_vol: NULL v")); error = 0; if (v->plexcount == 0) return (ENXIO); else if (v->plexcount == 1) { p = LIST_FIRST(&v->plexes); KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); error = gv_start_plex(p); } else error = gv_sync(v); return (error); } /* Sync a plex p from the plex up. 
*/ static int gv_sync_plex(struct gv_plex *p, struct gv_plex *up) { int error; KASSERT(p != NULL, ("%s: NULL p", __func__)); KASSERT(up != NULL, ("%s: NULL up", __func__)); if ((p == up) || (p->state == GV_PLEX_UP)) return (0); if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { return (EINPROGRESS); } p->synced = 0; p->flags |= GV_PLEX_SYNCING; G_VINUM_DEBUG(1, "starting sync of plex %s", p->name); error = gv_sync_request(up, p, p->synced, MIN(GV_DFLT_SYNCSIZE, up->size - p->synced), BIO_READ, NULL); if (error) { G_VINUM_DEBUG(0, "error syncing plex %s", p->name); return (error); } return (0); } /* Return a good plex from volume v. */ static struct gv_plex * gv_find_good_plex(struct gv_volume *v) { struct gv_plex *up; /* Find the plex that's up. */ up = NULL; LIST_FOREACH(up, &v->plexes, in_volume) { if (up->state == GV_PLEX_UP) break; } /* Didn't find a good plex. */ return (up); } static int gv_sync(struct gv_volume *v) { struct gv_softc *sc; struct gv_plex *p, *up; int error; KASSERT(v != NULL, ("gv_sync: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name)); up = gv_find_good_plex(v); if (up == NULL) return (ENXIO); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to access volume: %d", up->name, error); return (error); } g_topology_unlock(); /* Go through the good plex, and issue BIO's to all other plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { error = gv_sync_plex(p, up); if (error) break; } return (0); } static int gv_rebuild_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) return (EINPROGRESS); /* * Make sure that all subdisks have consumers. We won't allow a rebuild * unless every subdisk have one. */ LIST_FOREACH(s, &p->subdisks, in_plex) { d = s->drive_sc; if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) { G_VINUM_DEBUG(0, "unable to rebuild %s, subdisk(s) have" " no drives", p->name); return (ENXIO); } } p->flags |= GV_PLEX_REBUILDING; p->synced = 0; g_topology_assert_not(); g_topology_lock(); error = gv_access(p->vol_sc->provider, 1, 1, 0); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (0); } g_topology_unlock(); gv_parity_request(p, GV_BIO_REBUILD, 0); return (0); } static int gv_grow_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s; off_t origsize, origlength; int error, sdcount; KASSERT(p != NULL, ("gv_grow_plex: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_plex: NULL v")); if (p->flags & GV_PLEX_GROWING || p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) return (EINPROGRESS); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (error); } /* XXX: This routine with finding origsize is used two other places as * well, so we should create a function for it. 
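The XXX comment just above notes that this "original size" computation is duplicated elsewhere and could be factored out. A sketch of what such a helper might look like follows, with a hypothetical name and reduced structures; it mirrors the (sdcount - 1) arithmetic used below in gv_grow_plex(), counting only the subdisks that are not part of the grow.

/*
 * Hypothetical helper deriving the pre-grow data size and stripe length of a
 * plex from its non-growing subdisks, as gv_grow_plex() does inline.
 */
#include <stdio.h>

struct sd {
	long long	size;
	int		growing;
};

static void
orig_geometry(const struct sd *sds, int sdcount, long long stripesize,
    long long *origsize, long long *origlength)
{
	long long sdsize = sds[0].size;
	int i, oldcount = 0;

	for (i = 0; i < sdcount; i++)
		if (!sds[i].growing)
			oldcount++;
	/* Mirrors the (sdcount - 1) * size arithmetic in gv_grow_plex(). */
	*origsize = (oldcount - 1) * sdsize;
	*origlength = (oldcount - 1) * stripesize;
}

int
main(void)
{
	struct sd sds[4] = {
		{ 1 << 30, 0 }, { 1 << 30, 0 }, { 1 << 30, 0 }, { 1 << 30, 1 }
	};
	long long origsize, origlength;

	orig_geometry(sds, 4, 262144, &origsize, &origlength);
	printf("origsize=%lld origlength=%lld\n", origsize, origlength);
	return (0);
}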
*/ sdcount = p->sdcount; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } s = LIST_FIRST(&p->subdisks); if (s == NULL) { G_VINUM_DEBUG(0, "error growing plex without subdisks"); return (GV_ERR_NOTFOUND); } p->flags |= GV_PLEX_GROWING; origsize = (sdcount - 1) * s->size; origlength = (sdcount - 1) * p->stripesize; p->synced = 0; G_VINUM_DEBUG(1, "starting growing of plex %s", p->name); gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); return (0); } static int gv_init_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; off_t start; caddr_t data; KASSERT(p != NULL, ("gv_init_plex: NULL p")); LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state == GV_SD_INITIALIZING) return (EINPROGRESS); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); s->init_size = GV_DFLT_SYNCSIZE; start = s->drive_offset + s->initialized; d = s->drive_sc; if (d == NULL) { G_VINUM_DEBUG(0, "subdisk %s has no drive yet", s->name); break; } /* * Take the lock here since we need to avoid a race in * gv_init_request if the BIO is completed before the lock is * released. */ g_topology_lock(); error = g_access(d->consumer, 0, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "error accessing consumer when " "initializing %s", s->name); break; } data = g_malloc(s->init_size, M_WAITOK | M_ZERO); gv_init_request(s, start, data, s->init_size); } return (0); } Index: head/sys/geom/vinum/geom_vinum_move.c =================================================================== --- head/sys/geom/vinum/geom_vinum_move.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_move.c (revision 350694) @@ -1,190 +1,191 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Chris Jones * All rights reserved. * * This software was developed for the FreeBSD Project by Chris Jones * thanks to the support of Google's Summer of Code program and * mentoring by Lukas Ertl. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include void gv_move(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; char buf[20], *destination, *object; int *argc, *flags, i, type; sc = gp->softc; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } destination = gctl_get_param(req, "destination", NULL); if (destination == NULL) { gctl_error(req, "no destination given"); return; } if (gv_object_type(sc, destination) != GV_TYPE_DRIVE) { gctl_error(req, "destination '%s' is not a drive", destination); return; } d = gv_find_drive(sc, destination); /* * We start with 1 here, because argv[0] on the command line is the * destination drive. */ for (i = 1; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); object = gctl_get_param(req, buf, NULL); if (object == NULL) continue; type = gv_object_type(sc, object); if (type != GV_TYPE_SD) { gctl_error(req, "you can only move subdisks; " "'%s' is not a subdisk", object); return; } s = gv_find_sd(sc, object); if (s == NULL) { gctl_error(req, "unknown subdisk '%s'", object); return; } gv_post_event(sc, GV_EVENT_MOVE_SD, s, d, *flags, 0); } } /* Move a subdisk. */ int gv_move_sd(struct gv_softc *sc, struct gv_sd *cursd, struct gv_drive *destination, int flags) { struct gv_drive *d; struct gv_sd *newsd, *s, *s2; struct gv_plex *p; int err; g_topology_assert(); KASSERT(cursd != NULL, ("gv_move_sd: NULL cursd")); KASSERT(destination != NULL, ("gv_move_sd: NULL destination")); d = cursd->drive_sc; if ((gv_consumer_is_open(d->consumer) || gv_consumer_is_open(destination->consumer)) && !(flags & GV_FLAG_F)) { G_VINUM_DEBUG(0, "consumers on current and destination drive " " still open"); return (GV_ERR_ISBUSY); } if (!(flags & GV_FLAG_F)) { G_VINUM_DEBUG(1, "-f flag not passed; move would be " "destructive"); return (GV_ERR_INVFLAG); } if (destination == cursd->drive_sc) { G_VINUM_DEBUG(1, "subdisk '%s' already on drive '%s'", cursd->name, destination->name); return (GV_ERR_ISATTACHED); } /* XXX: Does it have to be part of a plex? */ p = gv_find_plex(sc, cursd->plex); if (p == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' is not part of a plex", cursd->name); return (GV_ERR_NOTFOUND); } /* Stale the old subdisk. */ err = gv_set_sd_state(cursd, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); if (err) { G_VINUM_DEBUG(0, "unable to set the subdisk '%s' to state " "'stale'", cursd->name); return (err); } /* * Create new subdisk. Ideally, we'd use gv_new_sd, but that requires * us to create a string for it to parse, which is silly. * TODO: maybe refactor gv_new_sd such that this is no longer the case. */ newsd = g_malloc(sizeof(struct gv_sd), M_WAITOK | M_ZERO); newsd->plex_offset = cursd->plex_offset; newsd->size = cursd->size; newsd->drive_offset = -1; strlcpy(newsd->name, cursd->name, sizeof(newsd->name)); strlcpy(newsd->drive, destination->name, sizeof(newsd->drive)); strlcpy(newsd->plex, cursd->plex, sizeof(newsd->plex)); newsd->state = GV_SD_STALE; newsd->vinumconf = cursd->vinumconf; err = gv_sd_to_drive(newsd, destination); if (err) { /* XXX not enough free space? */ g_free(newsd); return (err); } /* Replace the old sd by the new one. 
*/ LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { if (s == cursd) { gv_rm_sd(sc, s); } } gv_sd_to_plex(newsd, p); LIST_INSERT_HEAD(&sc->subdisks, newsd, sd); /* Update volume size of plex. */ if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); return (0); } Index: head/sys/geom/vinum/geom_vinum_plex.c =================================================================== --- head/sys/geom/vinum/geom_vinum_plex.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_plex.c (revision 350694) @@ -1,1050 +1,1051 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include static int gv_check_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static int gv_normal_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static void gv_plex_flush(struct gv_plex *); static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int); static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, off_t, caddr_t); static void gv_post_bio(struct gv_softc *, struct bio *); void gv_plex_start(struct gv_plex *p, struct bio *bp) { struct bio *cbp; struct gv_sd *s; struct gv_raid5_packet *wp; caddr_t addr; off_t bcount, boff, len; bcount = bp->bio_length; addr = bp->bio_data; boff = bp->bio_offset; /* Walk over the whole length of the request, we might split it up. */ while (bcount > 0) { wp = NULL; /* * RAID5 plexes need special treatment, as a single request * might involve several read/write sub-requests. */ if (p->org == GV_PLEX_RAID5) { wp = gv_raid5_start(p, bp, addr, boff, bcount); if (wp == NULL) return; len = wp->length; if (TAILQ_EMPTY(&wp->bits)) g_free(wp); else if (wp->lockbase != -1) TAILQ_INSERT_TAIL(&p->packets, wp, list); /* * Requests to concatenated and striped plexes go straight * through. */ } else { len = gv_plex_normal_request(p, bp, boff, bcount, addr); } if (len < 0) return; bcount -= len; addr += len; boff += len; } /* * Fire off all sub-requests. 
We get the correct consumer (== drive) * to send each request to via the subdisk that was stored in * cbp->bio_caller1. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { /* * RAID5 sub-requests need to come in correct order, otherwise * we trip over the parity, as it might be overwritten by * another sub-request. We abuse cbp->bio_caller2 to mark * potential overlap situations. */ if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { /* Park the bio on the waiting queue. */ cbp->bio_pflags |= GV_BIO_ONHOLD; bioq_disksort(p->wqueue, cbp); } else { s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); } cbp = bioq_takefirst(p->bqueue); } } static int gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int growing) { struct gv_sd *s; int i, sdcount; off_t len_left, stripeend, stripeno, stripestart; switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && s->plex_offset + s->size > boff) { *sdno = i; break; } i++; } if (s == NULL || s->drive_sc == NULL) return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ *real_off = boff - s->plex_offset; len_left = s->size - (*real_off); KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); /* Take growing subdisks into account when calculating. */ sdcount = gv_sdcount(p, (boff >= p->synced)); if (!(boff + bcount <= p->synced) && (p->flags & GV_PLEX_GROWING) && !growing) return (GV_ERR_ISBUSY); *sdno = stripeno % sdcount; KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); stripestart = (stripeno / sdcount) * p->stripesize; KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); stripeend = stripestart + p->stripesize; *real_off = boff - (stripeno * p->stripesize) + stripestart; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; break; default: return (GV_ERR_PLEXORG); } return (0); } /* * Prepare a normal plex request. */ static int gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, off_t bcount, caddr_t addr) { struct gv_sd *s; struct bio *cbp; off_t real_len, real_off; int i, err, sdno; s = NULL; sdno = -1; real_len = real_off = 0; err = ENXIO; if (p == NULL || LIST_EMPTY(&p->subdisks)) goto bad; err = gv_plex_offset(p, boff, bcount, &real_off, &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW)); /* If the request was blocked, put it into wait. */ if (err == GV_ERR_ISBUSY) { bioq_disksort(p->rqueue, bp); return (-1); /* "Fail", and delay request. */ } if (err) { err = ENXIO; goto bad; } err = ENXIO; /* Find the right subdisk. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) break; i++; } /* Subdisk not found. */ if (s == NULL || s->drive_sc == NULL) goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. 
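/*
 * A minimal sketch, not part of this change: the striped-plex mapping that
 * gv_plex_offset() above performs for GV_PLEX_STRIPED, in isolation.  It
 * assumes equal-sized subdisks and ignores the growing-subdisk special case.
 */
#include <stdint.h>

struct stripe_loc {
	int	sdno;		/* subdisk the first byte lands on */
	int64_t	sd_off;		/* offset within that subdisk */
	int64_t	max_len;	/* bytes available before the stripe boundary */
};

static struct stripe_loc
stripe_map(int64_t boff, int64_t bcount, int64_t stripesize, int sdcount)
{
	struct stripe_loc loc;
	int64_t stripeno, stripestart, stripeend;

	stripeno = boff / stripesize;		/* stripe number within the plex */
	loc.sdno = (int)(stripeno % sdcount);	/* stripes rotate across subdisks */
	stripestart = (stripeno / sdcount) * stripesize; /* stripe start on that subdisk */
	stripeend = stripestart + stripesize;
	loc.sd_off = boff - stripeno * stripesize + stripestart;
	loc.max_len = stripeend - loc.sd_off;
	if (bcount < loc.max_len)		/* clamp to the request, like MIN() */
		loc.max_len = bcount;
	return (loc);
}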
*/ break; case GV_SD_DOWN: if (bp->bio_pflags & GV_BIO_INTERNAL) G_VINUM_DEBUG(0, "subdisk must be in the stale state in" " order to perform administrative requests"); goto bad; case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) { G_VINUM_DEBUG(0, "subdisk stale, unable to perform " "regular requests"); goto bad; } G_VINUM_DEBUG(1, "sd %s is initializing", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) goto bad; break; default: /* All other subdisk states mean it's not accessible. */ goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); if (cbp == NULL) { err = ENOMEM; goto bad; } cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; cbp->bio_done = gv_done; cbp->bio_caller1 = s; /* Store the sub-requests now and let others issue them. */ bioq_insert_tail(p->bqueue, cbp); return (real_len); bad: G_VINUM_LOGREQ(0, bp, "plex request failed."); /* Building the sub-request failed. If internal BIO, do not deliver. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (-1); } g_io_deliver(bp, err); return (-1); } /* * Handle a completed request to a striped or concatenated plex. */ void gv_plex_normal_done(struct gv_plex *p, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { /* Just set it to length since multiple plexes will * screw things up. */ pbp->bio_completed = pbp->bio_length; if (pbp->bio_pflags & GV_BIO_SYNCREQ) gv_sync_complete(p, pbp); else if (pbp->bio_pflags & GV_BIO_GROW) gv_grow_complete(p, pbp); else g_io_deliver(pbp, pbp->bio_error); } } /* * Handle a completed request to a RAID-5 plex. */ void gv_plex_raid5_done(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct bio *cbp, *pbp; struct gv_bioq *bq, *bq2; struct gv_raid5_packet *wp; off_t completed; int i; completed = 0; sc = p->vinumconf; wp = bp->bio_caller2; switch (bp->bio_parent->bio_cmd) { case BIO_READ: if (wp == NULL) { completed = bp->bio_completed; break; } TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); for (i = 0; i < wp->length; i++) wp->data[i] ^= bp->bio_data[i]; break; } if (TAILQ_EMPTY(&wp->bits)) { completed = wp->length; if (wp->lockbase != -1) { TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } } g_free(wp); } break; case BIO_WRITE: /* XXX can this ever happen? */ if (wp == NULL) { completed = bp->bio_completed; break; } /* Check if we need to handle parity data. */ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); cbp = wp->parity; if (cbp != NULL) { for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= bp->bio_data[i]; } break; } /* Handle parity data. */ if (TAILQ_EMPTY(&wp->bits)) { if (bp->bio_parent->bio_pflags & GV_BIO_CHECK) i = gv_check_parity(p, bp, wp); else i = gv_normal_parity(p, bp, wp); /* All of our sub-requests have finished. */ if (i) { completed = wp->length; TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. 
*/ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } g_free(wp); } } break; } pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += completed; /* When the original request is finished, we deliver it. */ pbp->bio_inbed++; if (pbp->bio_inbed == pbp->bio_children) { /* Hand it over for checking or delivery. */ if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_CHECK)) { gv_parity_complete(p, pbp); } else if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_REBUILD)) { gv_rebuild_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_INIT) { gv_init_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_SYNCREQ) { gv_sync_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_GROW) { gv_grow_complete(p, pbp); } else { g_io_deliver(pbp, pbp->bio_error); } } /* Clean up what we allocated. */ if (bp->bio_cflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); } static int gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *pbp; struct gv_sd *s; int err, finished, i; err = 0; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { pbp = wp->parity; wp->parity = NULL; /* Check if the parity is correct. */ for (i = 0; i < wp->length; i++) { if (bp->bio_data[i] != pbp->bio_data[i]) { err = 1; break; } } /* The parity is not correct... */ if (err) { bp->bio_parent->bio_error = EAGAIN; /* ... but we rebuild it. */ if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) { s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } } /* * Clean up the BIO we would have used for rebuilding the * parity. */ if (finished) { bp->bio_parent->bio_inbed++; g_destroy_bio(pbp); } } return (finished); } static int gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *cbp, *pbp; struct gv_sd *s; int finished, i; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; cbp = wp->parity; for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= pbp->bio_data[i]; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { cbp = wp->parity; wp->parity = NULL; s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); finished = 0; } return (finished); } /* Flush the queue with delayed requests. 
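/*
 * A minimal sketch, not part of this change: the XOR arithmetic behind
 * gv_normal_parity()/gv_check_parity() above.  Parity is the byte-wise XOR
 * of all data stripes, so a missing stripe is rebuilt by XOR-ing the
 * survivors with the parity, and a parity check is recompute-and-compare.
 */
#include <stddef.h>
#include <string.h>

/* XOR 'src' into 'dst' -- the core operation applied to bio_data buffers. */
static void
xor_into(unsigned char *dst, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[i] ^= src[i];
}

/* Rebuild one missing stripe from the surviving stripes plus the parity. */
static void
rebuild_stripe(unsigned char *out, const unsigned char *const *survivors,
    int nsurvivors, const unsigned char *parity, size_t len)
{
	int i;

	memcpy(out, parity, len);
	for (i = 0; i < nsurvivors; i++)
		xor_into(out, survivors[i], len);
}

/* Return 0 if the stored parity matches the XOR of all data stripes. */
static int
check_parity(const unsigned char *const *data, int ndata,
    const unsigned char *parity, size_t len)
{
	size_t i;
	int d;
	unsigned char x;

	for (i = 0; i < len; i++) {
		x = 0;
		for (d = 0; d < ndata; d++)
			x ^= data[d][i];
		if (x != parity[i])
			return (1);	/* mismatch; caller may rewrite parity */
	}
	return (0);
}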
*/ static void gv_plex_flush(struct gv_plex *p) { struct gv_softc *sc; struct bio *bp; sc = p->vinumconf; bp = bioq_takefirst(p->rqueue); while (bp != NULL) { gv_plex_start(p, bp); bp = bioq_takefirst(p->rqueue); } } static void gv_post_bio(struct gv_softc *sc, struct bio *bp) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(bp != NULL, ("NULL bp")); mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(from != NULL, ("NULL from")); KASSERT(to != NULL, ("NULL to")); sc = from->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "sync from '%s' failed at offset " " %jd; out of memory", from->name, offset); return (ENOMEM); } bp->bio_length = length; bp->bio_done = gv_done; bp->bio_pflags |= GV_BIO_SYNCREQ; bp->bio_offset = offset; bp->bio_caller1 = from; bp->bio_caller2 = to; bp->bio_cmd = type; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */ bp->bio_data = data; /* Send down next. */ gv_post_bio(sc, bp); //gv_plex_start(from, bp); return (0); } /* * Handle a finished plex sync bio. */ int gv_sync_complete(struct gv_plex *to, struct bio *bp) { struct gv_plex *from, *p; struct gv_sd *s; struct gv_volume *v; struct gv_softc *sc; off_t offset; int err; g_topology_assert_not(); err = 0; KASSERT(to != NULL, ("NULL to")); KASSERT(bp != NULL, ("NULL bp")); from = bp->bio_caller2; KASSERT(from != NULL, ("NULL from")); v = to->vol_sc; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read the next one. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); to->synced += bp->bio_length; /* If we're finished, clean up. */ if (bp->bio_offset + bp->bio_length >= from->size) { G_VINUM_DEBUG(1, "syncing of %s from %s completed", to->name, from->name); /* Update our state. */ LIST_FOREACH(s, &to->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, 0); gv_update_plex_state(to); to->flags &= ~GV_PLEX_SYNCING; to->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } else { offset = bp->bio_offset + bp->bio_length; err = gv_sync_request(from, to, offset, MIN(bp->bio_length, from->size - offset), BIO_READ, NULL); } } g_destroy_bio(bp); /* Clean up if there was an error. */ if (err) { to->flags &= ~GV_PLEX_SYNCING; G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err); } /* Check if all plexes are synced, and lower refcounts. */ g_topology_lock(); LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { g_topology_unlock(); return (-1); } } /* If we came here, all plexes are synced, and we're free. */ gv_access(v->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "plex sync completed"); gv_volume_flush(v); return (0); } /* * Create a new bio struct for the next grow request. 
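/*
 * A minimal sketch, not part of this change: the read-then-write chain that
 * gv_sync_request()/gv_sync_complete() above build asynchronously out of
 * BIO_READ/BIO_WRITE completions, written as a plain synchronous loop.
 * plex_read() and plex_write() are hypothetical stand-ins for issuing a
 * request to the source and target plex.
 */
#include <stdint.h>
#include <stdlib.h>

int plex_read(void *from, int64_t off, void *buf, int64_t len);      /* assumed */
int plex_write(void *to, int64_t off, const void *buf, int64_t len); /* assumed */

static int
sync_plex(void *from, void *to, int64_t size, int64_t chunk)
{
	unsigned char *buf;
	int64_t off, len;
	int err = 0;

	buf = malloc((size_t)chunk);
	if (buf == NULL)
		return (-1);
	for (off = 0; off < size && err == 0; off += len) {
		len = (size - off < chunk) ? size - off : chunk;	/* MIN() */
		err = plex_read(from, off, buf, len);	/* gv_sync_request(BIO_READ) */
		if (err == 0)				/* completion turns it around */
			err = plex_write(to, off, buf, len);
	}
	free(buf);
	return (err);
}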
*/ int gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_grow_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_grow_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "grow of %s failed creating bio: " "out of memory", p->name); return (ENOMEM); } bp->bio_cmd = type; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_caller1 = p; bp->bio_offset = offset; bp->bio_length = length; bp->bio_pflags |= GV_BIO_GROW; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; bp->bio_data = data; gv_post_bio(sc, bp); //gv_plex_start(p, bp); return (0); } /* * Finish handling of a bio to a growing plex. */ void gv_grow_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; struct gv_volume *v; off_t origsize, offset; int sdcount, err; v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_complete: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_grow_complete: NULL sc")); err = 0; /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { p->synced += bp->bio_length; err = gv_grow_request(p, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read next. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); /* Find the real size of the plex. */ sdcount = gv_sdcount(p, 1); s = LIST_FIRST(&p->subdisks); KASSERT(s != NULL, ("NULL s")); origsize = (s->size * (sdcount - 1)); if (bp->bio_offset + bp->bio_length >= origsize) { G_VINUM_DEBUG(1, "growing of %s completed", p->name); p->flags &= ~GV_PLEX_GROWING; LIST_FOREACH(s, &p->subdisks, in_plex) { s->flags &= ~GV_SD_GROW; gv_set_sd_state(s, GV_SD_UP, 0); } p->size = gv_plex_size(p); gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); g_topology_lock(); gv_access(v->provider, -1, -1, 0); g_topology_unlock(); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Issue delayed requests. */ gv_plex_flush(p); } else { offset = bp->bio_offset + bp->bio_length; err = gv_grow_request(p, offset, MIN(bp->bio_length, origsize - offset), BIO_READ, NULL); } } g_destroy_bio(bp); if (err) { p->flags &= ~GV_PLEX_GROWING; G_VINUM_DEBUG(0, "error growing plex: error code %d", err); } } /* * Create an initialization BIO and send it off to the consumer. Assume that * we're given initialization data as parameter. */ void gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) { struct gv_drive *d; struct g_consumer *cp; struct bio *bp, *cbp; KASSERT(s != NULL, ("gv_init_request: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_request: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_request: NULL cp")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } bp->bio_cmd = BIO_WRITE; bp->bio_data = data; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_length = length; bp->bio_pflags |= GV_BIO_INIT; bp->bio_offset = start; bp->bio_caller1 = s; /* Then ofcourse, we have to clone it. */ cbp = g_clone_bio(bp); if (cbp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. 
*/ } cbp->bio_done = gv_done; cbp->bio_caller1 = s; /* Send it off to the consumer. */ g_io_request(cbp, cp); } /* * Handle a finished initialization BIO. */ void gv_init_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_drive *d; struct g_consumer *cp; struct gv_sd *s; off_t start, length; caddr_t data; int error; s = bp->bio_caller1; start = bp->bio_offset; length = bp->bio_length; error = bp->bio_error; data = bp->bio_data; KASSERT(s != NULL, ("gv_init_complete: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_complete: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_init_complete: NULL sc")); g_destroy_bio(bp); /* * First we need to find out if it was okay, and abort if it's not. * Then we need to free previous buffers, find out the correct subdisk, * as well as getting the correct starting point and length of the BIO. */ if (start >= s->drive_offset + s->size) { /* Free the data we initialized. */ if (data != NULL) g_free(data); g_topology_assert_not(); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); if (error) { gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); } else { gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); s->initialized = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); G_VINUM_DEBUG(1, "subdisk '%s' init: finished " "successfully", s->name); } return; } s->initialized += length; start += length; gv_init_request(s, start, data, length); } /* * Create a new bio struct for the next parity rebuild. Used both by internal * rebuild of degraded plexes as well as user initiated rebuilds/checks. */ void gv_parity_request(struct gv_plex *p, int flags, off_t offset) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_parity_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: " "out of memory", p->name); return; } bp->bio_cmd = BIO_WRITE; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_length = p->stripesize; bp->bio_caller1 = p; /* * Check if it's a rebuild of a degraded plex or a user request of * parity rebuild. */ if (flags & GV_BIO_REBUILD) bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); else if (flags & GV_BIO_CHECK) bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); else { G_VINUM_DEBUG(0, "invalid flags given in rebuild"); return; } bp->bio_pflags = flags; bp->bio_pflags |= GV_BIO_MALLOC; /* We still have more parity to build. */ bp->bio_offset = offset; gv_post_bio(sc, bp); //gv_plex_start(p, bp); /* Send it down to the plex. */ } /* * Handle a finished parity write. */ void gv_parity_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; int error, flags; error = bp->bio_error; flags = bp->bio_pflags; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error == EAGAIN) { G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx", (intmax_t)p->synced); } /* Any error is fatal, except EAGAIN when we're rebuilding. */ if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) { /* Make sure we don't have the lock. 
*/ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx " "errno %d", p->name, (intmax_t)p->synced, error); return; } else { p->synced += p->stripesize; } if (p->synced >= p->size) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); /* We're finished. */ G_VINUM_DEBUG(1, "parity operation on %s finished", p->name); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, p->synced); } /* * Handle a finished plex rebuild bio. */ void gv_rebuild_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; int error, flags; off_t offset; error = bp->bio_error; flags = bp->bio_pflags; offset = bp->bio_offset; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error) { g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d", p->name, (intmax_t)offset, error); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); if (offset >= p->size) { /* We're finished. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "rebuild of %s finished", p->name); gv_save_config(p->vinumconf); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; /* Try to up all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) gv_update_sd_state(s); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, offset); } Index: head/sys/geom/vinum/geom_vinum_raid5.c =================================================================== --- head/sys/geom/vinum/geom_vinum_raid5.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_raid5.c (revision 350694) @@ -1,663 +1,664 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *, int); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t, int *); static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); struct gv_raid5_packet * gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct bio *cbp; struct gv_raid5_packet *wp, *wp2; struct gv_bioq *bq, *bq2; int err, delay; delay = 0; wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); wp->bio = bp; wp->waiting = NULL; wp->parity = NULL; TAILQ_INIT(&wp->bits); if (bp->bio_pflags & GV_BIO_REBUILD) err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); else if (bp->bio_pflags & GV_BIO_CHECK) err = gv_raid5_check(p, wp, bp, addr, boff, bcount); else err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); /* Means we have a delayed request. */ if (delay) { g_free(wp); return (NULL); } /* * Building the sub-request failed, we probably need to clean up a lot. */ if (err) { G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } if (wp->waiting != NULL) { if (wp->waiting->bio_cflags & GV_BIO_MALLOC) g_free(wp->waiting->bio_data); g_destroy_bio(wp->waiting); } if (wp->parity != NULL) { if (wp->parity->bio_cflags & GV_BIO_MALLOC) g_free(wp->parity->bio_data); g_destroy_bio(wp->parity); } g_free(wp); TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { if (wp->bio != bp) continue; TAILQ_REMOVE(&p->packets, wp, list); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } g_free(wp); } cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { if (cbp->bio_cflags & GV_BIO_MALLOC) g_free(cbp->bio_data); g_destroy_bio(cbp); cbp = bioq_takefirst(p->bqueue); } /* If internal, stop and reset state. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); /* Reset flags. */ p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (NULL); } g_io_deliver(bp, err); return (NULL); } return (wp); } /* * Check if the stripe that the work packet wants is already being used by * some other work packet. 
*/ int gv_stripe_active(struct gv_plex *p, struct bio *bp) { struct gv_raid5_packet *wp, *owp; int overlap; wp = bp->bio_caller2; if (wp->lockbase == -1) return (0); overlap = 0; TAILQ_FOREACH(owp, &p->packets, list) { if (owp == wp) break; if ((wp->lockbase >= owp->lockbase) && (wp->lockbase <= owp->lockbase + owp->length)) { overlap++; break; } if ((wp->lockbase <= owp->lockbase) && (wp->lockbase + wp->length >= owp->lockbase)) { overlap++; break; } } return (overlap); } static int gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the right subdisk. */ parity = NULL; i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == psdno) { parity = s; break; } i++; } /* Parity stripe not found. */ if (parity == NULL) return (ENXIO); if (parity->state != GV_SD_UP) return (ENXIO); wp->length = real_len; wp->data = addr; wp->lockbase = real_off; /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the parity subdisk. */ if (s == parity) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Read the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations. */ cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex. */ static int gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the right subdisk. */ broken = NULL; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_UP) broken = s; } /* Broken stripe not found. */ if (broken == NULL) return (ENXIO); switch (broken->state) { case GV_SD_UP: return (EINVAL); case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_REBUILD)) return (ENXIO); G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); /* Set this bit now, but should be set at end. */ broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: break; default: /* All other subdisk states mean it's not accessible. */ return (ENXIO); } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing subdisks. 
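/*
 * A minimal sketch, not part of this change: the interval test at the heart
 * of gv_stripe_active() above.  Two RAID5 work packets collide when their
 * locked ranges [base, base + length] touch or overlap; the real function
 * walks p->packets and only compares against packets queued earlier, so the
 * older packet always wins and the newer one is parked on the wait queue.
 */
#include <stdint.h>

static int
ranges_overlap(int64_t base1, int64_t len1, int64_t base2, int64_t len2)
{
	/* Same closed-interval comparison as the two if() blocks above. */
	if (base1 >= base2 && base1 <= base2 + len2)
		return (1);
	if (base1 <= base2 && base1 + len1 >= base2)
		return (1);
	return (0);
}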
*/ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; p->synced = boff; /* Post notification that we're finished. */ return (0); } /* Build a request group to perform (part of) a RAID5 request. */ static int gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno, sdno, type, grow; off_t real_len, real_off; gp = bp->bio_to->geom; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ #define REQ_TYPE_NORMAL 0 #define REQ_TYPE_DEGRADED 1 #define REQ_TYPE_NOPARITY 2 type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* XXX: The resize won't crash with rebuild or sync, but we should still * be aware of it. Also this should perhaps be done on rebuild/check as * well? */ /* If we're over, we must use the old. */ if (boff >= p->synced) { grow = 1; /* Or if over the resized offset, we use all drives. */ } else if (boff + bcount <= p->synced) { grow = 0; /* Else, we're in the middle, and must wait a bit. */ } else { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) original = s; if (i == psdno) parity = s; if (s->state != GV_SD_UP) broken = s; i++; } if ((original == NULL) || (parity == NULL)) return (ENXIO); /* Our data stripe is missing. */ if (original->state != GV_SD_UP) type = REQ_TYPE_DEGRADED; /* If synchronizing request, just write it if disks are stale. */ if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { type = REQ_TYPE_NORMAL; /* Our parity stripe is missing. */ } else if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ if (type != REQ_TYPE_NORMAL) return (ENXIO); else type = REQ_TYPE_NOPARITY; } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } switch (bp->bio_cmd) { case BIO_READ: /* * For a degraded read we need to read in all stripes except * the broken one plus the parity stripe and then recalculate * the desired data. */ if (type == REQ_TYPE_DEGRADED) { bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. 
*/ } else { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; break; case BIO_WRITE: /* * A degraded write means we cannot write to the original data * subdisk. Thus we need to read in all valid stripes, * recalculate the parity from the original data, and then * write the parity stripe back out. */ if (type == REQ_TYPE_DEGRADED) { /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we * read in all other stripes, recalculate the parity and write * out the parity again. */ } else { /* Read old parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. */ cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. */ cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); /* * We must not write the new data until the old data * was read, so hold this BIO back until we're ready * for it. */ wp->waiting = cbp; /* The final bio for the parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); /* Remember that this is the BIO for the parity data. */ wp->parity = cbp; } break; default: return (EINVAL); } return (0); } /* * Calculate the offsets in the various subdisks for a RAID5 request. Also take * care of new subdisks in an expanded RAID5 array. * XXX: This assumes that the new subdisks are inserted after the others (which * is okay as long as plex_offset is larger). If subdisks are inserted into the * plexlist before, we get problems. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int *psdno, int growing) { struct gv_sd *s; int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } /* The number of the subdisk containing the parity stripe. */ psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % sdcount; KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); /* Offset of the start address from the start of the stripe. 
*/ stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ sd = stripeoff / p->stripesize; KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); /* At or past parity subdisk. */ if (sd >= psd) sd++; /* The offset of the stripe on this subdisk. */ stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; /* The offset of the request on this subdisk. */ *real_off = stripestart + stripeoff; stripeend = stripestart + p->stripesize; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; if (sdno != NULL) *sdno = sd; if (psdno != NULL) *psdno = psd; return (0); } static struct bio * gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, caddr_t addr, int use_wp) { struct bio *cbp; cbp = g_clone_bio(bp); if (cbp == NULL) return (NULL); if (addr == NULL) { cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); cbp->bio_cflags |= GV_BIO_MALLOC; } else cbp->bio_data = addr; cbp->bio_offset = wp->lockbase + s->drive_offset; cbp->bio_length = wp->length; cbp->bio_done = gv_done; cbp->bio_caller1 = s; if (use_wp) cbp->bio_caller2 = wp; return (cbp); } Index: head/sys/geom/vinum/geom_vinum_rename.c =================================================================== --- head/sys/geom/vinum/geom_vinum_rename.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_rename.c (revision 350694) @@ -1,263 +1,264 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Chris Jones * All rights reserved. * * This software was developed for the FreeBSD Project by Chris Jones * thanks to the support of Google's Summer of Code program and * mentoring by Lukas Ertl. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
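/*
 * A minimal sketch, not part of this change: the RAID5 mapping computed by
 * gv_raid5_offset() in geom_vinum_raid5.c above, in isolation -- which
 * subdisk holds the data, which holds the parity for that stripe row, and
 * where on the data subdisk the request starts.  The parity column rotates
 * backwards by one subdisk per full stripe row.
 */
#include <stdint.h>

struct raid5_loc {
	int	sdno;		/* data subdisk */
	int	psdno;		/* parity subdisk for this stripe row */
	int64_t	sd_off;		/* byte offset on the data subdisk */
};

static struct raid5_loc
raid5_map(int64_t boff, int64_t stripesize, int sdcount)
{
	struct raid5_loc loc;
	int64_t rowsize, stripeoff, stripestart;

	rowsize = stripesize * (sdcount - 1);	/* data bytes per stripe row */
	loc.psdno = sdcount - 1 - (int)((boff / rowsize) % sdcount);
	stripeoff = boff % rowsize;		/* offset within the row */
	loc.sdno = (int)(stripeoff / stripesize);
	if (loc.sdno >= loc.psdno)		/* skip over the parity column */
		loc.sdno++;
	stripestart = (boff - stripeoff) / (sdcount - 1);
	loc.sd_off = stripestart + stripeoff % stripesize;
	return (loc);
}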
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include +#include #include #include void gv_rename(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; char *newname, *object, *name; int *flags, type; sc = gp->softc; flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } newname = gctl_get_param(req, "newname", NULL); if (newname == NULL) { gctl_error(req, "no new name given"); return; } object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no object given"); return; } type = gv_object_type(sc, object); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, object); if (v == NULL) { gctl_error(req, "unknown volume '%s'", object); return; } name = g_malloc(GV_MAXVOLNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXVOLNAME); gv_post_event(sc, GV_EVENT_RENAME_VOL, v, name, *flags, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, object); if (p == NULL) { gctl_error(req, "unknown plex '%s'", object); return; } name = g_malloc(GV_MAXPLEXNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXPLEXNAME); gv_post_event(sc, GV_EVENT_RENAME_PLEX, p, name, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); if (s == NULL) { gctl_error(req, "unknown subdisk '%s'", object); return; } name = g_malloc(GV_MAXSDNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXSDNAME); gv_post_event(sc, GV_EVENT_RENAME_SD, s, name, *flags, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, object); if (d == NULL) { gctl_error(req, "unknown drive '%s'", object); return; } name = g_malloc(GV_MAXDRIVENAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXDRIVENAME); gv_post_event(sc, GV_EVENT_RENAME_DRIVE, d, name, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", object); return; } } int gv_rename_drive(struct gv_softc *sc, struct gv_drive *d, char *newname, int flags) { struct gv_sd *s; KASSERT(d != NULL, ("gv_rename_drive: NULL d")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "drive name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } strlcpy(d->name, newname, sizeof(d->name)); if (d->hdr != NULL) strlcpy(d->hdr->label.name, newname, sizeof(d->hdr->label.name)); LIST_FOREACH(s, &d->subdisks, from_drive) strlcpy(s->drive, d->name, sizeof(s->drive)); return (0); } int gv_rename_plex(struct gv_softc *sc, struct gv_plex *p, char *newname, int flags) { char newsd[GV_MAXSDNAME]; struct gv_sd *s; char *ptr; int err; KASSERT(p != NULL, ("gv_rename_plex: NULL p")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "plex name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } /* * Locate the plex number part of the plex names. * XXX: might be a good idea to sanitize input a bit more */ ptr = strrchr(newname, '.'); if (ptr == NULL) { G_VINUM_DEBUG(0, "proposed plex name '%s' is not a valid plex " "name", newname); return (GV_ERR_INVNAME); } strlcpy(p->name, newname, sizeof(p->name)); /* Fix up references and potentially rename subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { strlcpy(s->plex, p->name, sizeof(s->plex)); if (flags & GV_FLAG_R) { /* * Look for the two last dots in the string, and assume * that the old value was ok. 
*/ ptr = strrchr(s->name, '.'); if (ptr == NULL) return (GV_ERR_INVNAME); ptr++; snprintf(newsd, sizeof(newsd), "%s.%s", p->name, ptr); err = gv_rename_sd(sc, s, newsd, flags); if (err) return (err); } } return (0); } /* * gv_rename_sd: renames a subdisk. Note that the 'flags' argument is ignored, * since there are no structures below a subdisk. Similarly, we don't have to * clean up any references elsewhere to the subdisk's name. */ int gv_rename_sd(struct gv_softc *sc, struct gv_sd *s, char *newname, int flags) { char *dot1, *dot2; KASSERT(s != NULL, ("gv_rename_sd: NULL s")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "subdisk name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Locate the sd number part of the sd names. */ dot1 = strchr(newname, '.'); if (dot1 == NULL || (dot2 = strchr(dot1 + 1, '.')) == NULL) { G_VINUM_DEBUG(0, "proposed sd name '%s' is not a valid sd name", newname); return (GV_ERR_INVNAME); } strlcpy(s->name, newname, sizeof(s->name)); return (0); } int gv_rename_vol(struct gv_softc *sc, struct gv_volume *v, char *newname, int flags) { struct g_provider *pp; struct gv_plex *p; char newplex[GV_MAXPLEXNAME], *ptr; int err; KASSERT(v != NULL, ("gv_rename_vol: NULL v")); pp = v->provider; KASSERT(pp != NULL, ("gv_rename_vol: NULL pp")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "volume name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Rename the volume. */ strlcpy(v->name, newname, sizeof(v->name)); /* Fix up references and potentially rename plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { strlcpy(p->volume, v->name, sizeof(p->volume)); if (flags & GV_FLAG_R) { /* * Look for the last dot in the string, and assume that * the old value was ok. */ ptr = strrchr(p->name, '.'); ptr++; snprintf(newplex, sizeof(newplex), "%s.%s", v->name, ptr); err = gv_rename_plex(sc, p, newplex, flags); if (err) return (err); } } return (0); } Index: head/sys/geom/vinum/geom_vinum_rm.c =================================================================== --- head/sys/geom/vinum/geom_vinum_rm.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_rm.c (revision 350694) @@ -1,389 +1,390 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
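/*
 * A minimal sketch, not part of this change: how gv_rename_plex() and
 * gv_rename_vol() in geom_vinum_rename.c above derive child names on a
 * recursive rename.  gvinum names are dot-separated ("vol.pN.sM"); only the
 * suffix after the last dot is kept and the new parent name is prepended.
 * derive_child_name() is a hypothetical helper, not a gvinum function.
 */
#include <stdio.h>
#include <string.h>

static int
derive_child_name(char *out, size_t outlen, const char *newparent,
    const char *oldchild)
{
	const char *suffix;

	suffix = strrchr(oldchild, '.');	/* e.g. ".s0" of "vol.p0.s0" */
	if (suffix == NULL)
		return (-1);			/* not a valid gvinum name */
	snprintf(out, outlen, "%s%s", newparent, suffix);
	return (0);
}

/* derive_child_name(buf, sizeof(buf), "vol.p1", "vol.p0.s0") yields "vol.p1.s0". */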
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include +#include #include #include /* General 'remove' routine. */ void gv_remove(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; int *argc, *flags; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } sc = gp->softc; /* XXX config locking */ for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); /* * If this volume has plexes, we want a recursive * removal. */ if (!LIST_EMPTY(&v->plexes) && !(*flags & GV_FLAG_R)) { gctl_error(req, "volume '%s' has attached " "plexes - need recursive removal", v->name); return; } gv_post_event(sc, GV_EVENT_RM_VOLUME, v, NULL, 0, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); /* * If this plex has subdisks, we want a recursive * removal. */ if (!LIST_EMPTY(&p->subdisks) && !(*flags & GV_FLAG_R)) { gctl_error(req, "plex '%s' has attached " "subdisks - need recursive removal", p->name); return; } /* Don't allow removal of the only plex of a volume. */ if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) { gctl_error(req, "plex '%s' is still attached " "to volume '%s'", p->name, p->volume); return; } gv_post_event(sc, GV_EVENT_RM_PLEX, p, NULL, 0, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, argv); /* Don't allow removal if attached to a plex. */ if (s->plex_sc != NULL) { gctl_error(req, "subdisk '%s' is still attached" " to plex '%s'", s->name, s->plex_sc->name); return; } gv_post_event(sc, GV_EVENT_RM_SD, s, NULL, 0, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, argv); /* We don't allow to remove open drives. */ if (gv_consumer_is_open(d->consumer) && !(*flags & GV_FLAG_F)) { gctl_error(req, "drive '%s' is open", d->name); return; } /* A drive with subdisks needs a recursive removal. */ /* if (!LIST_EMPTY(&d->subdisks) && !(*flags & GV_FLAG_R)) { gctl_error(req, "drive '%s' still has subdisks" " - need recursive removal", d->name); return; }*/ gv_post_event(sc, GV_EVENT_RM_DRIVE, d, NULL, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", argv); return; } } gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* Resets configuration */ int gv_resetconfig(struct gv_softc *sc) { struct gv_drive *d, *d2; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; /* First make sure nothing is open. */ LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { if (gv_consumer_is_open(d->consumer)) { return (GV_ERR_ISBUSY); } } /* Make sure nothing is going on internally. 
*/ LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { if (p->flags & (GV_PLEX_REBUILDING | GV_PLEX_GROWING)) return (GV_ERR_ISBUSY); } /* Then if not, we remove everything. */ LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) gv_rm_sd(sc, s); LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) gv_rm_drive(sc, d, 0); LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) gv_rm_plex(sc, p); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) gv_rm_vol(sc, v); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } /* Remove a volume. */ void gv_rm_vol(struct gv_softc *sc, struct gv_volume *v) { struct g_provider *pp; struct gv_plex *p, *p2; KASSERT(v != NULL, ("gv_rm_vol: NULL v")); pp = v->provider; KASSERT(pp != NULL, ("gv_rm_vol: NULL pp")); /* Check if any of our consumers is open. */ if (gv_provider_is_open(pp)) { G_VINUM_DEBUG(0, "unable to remove %s: volume still in use", v->name); return; } /* Remove the plexes our volume has. */ LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2) gv_rm_plex(sc, p); /* Clean up. */ LIST_REMOVE(v, volume); g_free(v); /* Get rid of the volume's provider. */ if (pp != NULL) { g_topology_lock(); g_wither_provider(pp, ENXIO); g_topology_unlock(); } } /* Remove a plex. */ void gv_rm_plex(struct gv_softc *sc, struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s, *s2; KASSERT(p != NULL, ("gv_rm_plex: NULL p")); v = p->vol_sc; /* Check if any of our consumers is open. */ if (v != NULL && gv_provider_is_open(v->provider) && v->plexcount < 2) { G_VINUM_DEBUG(0, "unable to remove %s: volume still in use", p->name); return; } /* Remove the subdisks our plex has. */ LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) gv_rm_sd(sc, s); v = p->vol_sc; /* Clean up and let our geom fade away. */ LIST_REMOVE(p, plex); if (p->vol_sc != NULL) { p->vol_sc->plexcount--; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; /* Correctly update the volume size. */ gv_update_vol_size(v, gv_vol_size(v)); } g_free(p); } /* Remove a subdisk. */ void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s) { struct gv_plex *p; struct gv_volume *v; KASSERT(s != NULL, ("gv_rm_sd: NULL s")); p = s->plex_sc; v = NULL; /* Clean up. */ if (p != NULL) { LIST_REMOVE(s, in_plex); s->plex_sc = NULL; p->sdcount--; /* Update the plexsize. */ p->size = gv_plex_size(p); v = p->vol_sc; if (v != NULL) { /* Update the size of our plex' volume. */ gv_update_vol_size(v, gv_vol_size(v)); } } if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); LIST_REMOVE(s, sd); gv_free_sd(s); g_free(s); } /* Remove a drive. */ void gv_rm_drive(struct gv_softc *sc, struct gv_drive *d, int flags) { struct g_consumer *cp; struct gv_freelist *fl, *fl2; struct gv_plex *p; struct gv_sd *s, *s2; struct gv_volume *v; struct gv_drive *d2; int err; KASSERT(d != NULL, ("gv_rm_drive: NULL d")); cp = d->consumer; if (cp != NULL) { g_topology_lock(); err = g_access(cp, 0, 1, 0); g_topology_unlock(); if (err) { G_VINUM_DEBUG(0, "%s: unable to access '%s', " "errno: %d", __func__, cp->provider->name, err); return; } /* Clear the Vinum Magic. */ d->hdr->magic = GV_NOMAGIC; err = gv_write_header(cp, d->hdr); if (err) G_VINUM_DEBUG(0, "gv_rm_drive: error writing header to" " '%s', errno: %d", cp->provider->name, err); g_topology_lock(); g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } /* Remove all associated subdisks, plexes, volumes. 
*/ if (flags & GV_FLAG_R) { if (!LIST_EMPTY(&d->subdisks)) { LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { p = s->plex_sc; if (p != NULL) { v = p->vol_sc; if (v != NULL) gv_rm_vol(sc, v); } } } } /* Clean up. */ LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); /* Put ourself into referenced state if we have subdisks. */ if (d->sdcount > 0) { d->consumer = NULL; d->hdr = NULL; d->flags |= GV_DRIVE_REFERENCED; snprintf(d->device, sizeof(d->device), "???"); d->size = 0; d->avail = 0; d->freelist_entries = 0; LIST_FOREACH(s, &d->subdisks, from_drive) { s->flags |= GV_SD_TASTED; gv_set_sd_state(s, GV_SD_DOWN, GV_SETSTATE_FORCE); } /* Shuffle around so we keep gv_is_newer happy. */ LIST_REMOVE(d, drive); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); return; } g_free(d); gv_save_config(sc); } Index: head/sys/geom/vinum/geom_vinum_state.c =================================================================== --- head/sys/geom/vinum/geom_vinum_state.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_state.c (revision 350694) @@ -1,536 +1,537 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include void gv_setstate(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; char *obj, *state; int f, *flags, type; f = 0; obj = gctl_get_param(req, "object", NULL); if (obj == NULL) { gctl_error(req, "no object given"); return; } state = gctl_get_param(req, "state", NULL); if (state == NULL) { gctl_error(req, "no state given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } if (*flags & GV_FLAG_F) f = GV_SETSTATE_FORCE; sc = gp->softc; type = gv_object_type(sc, obj); switch (type) { case GV_TYPE_VOL: if (gv_volstatei(state) < 0) { gctl_error(req, "invalid volume state '%s'", state); break; } v = gv_find_vol(sc, obj); gv_post_event(sc, GV_EVENT_SET_VOL_STATE, v, NULL, gv_volstatei(state), f); break; case GV_TYPE_PLEX: if (gv_plexstatei(state) < 0) { gctl_error(req, "invalid plex state '%s'", state); break; } p = gv_find_plex(sc, obj); gv_post_event(sc, GV_EVENT_SET_PLEX_STATE, p, NULL, gv_plexstatei(state), f); break; case GV_TYPE_SD: if (gv_sdstatei(state) < 0) { gctl_error(req, "invalid subdisk state '%s'", state); break; } s = gv_find_sd(sc, obj); gv_post_event(sc, GV_EVENT_SET_SD_STATE, s, NULL, gv_sdstatei(state), f); break; case GV_TYPE_DRIVE: if (gv_drivestatei(state) < 0) { gctl_error(req, "invalid drive state '%s'", state); break; } d = gv_find_drive(sc, obj); gv_post_event(sc, GV_EVENT_SET_DRIVE_STATE, d, NULL, gv_drivestatei(state), f); break; default: gctl_error(req, "unknown object '%s'", obj); break; } } /* Update drive state; return 0 if the state changes, otherwise error. */ int gv_set_drive_state(struct gv_drive *d, int newstate, int flags) { struct gv_sd *s; int oldstate; KASSERT(d != NULL, ("gv_set_drive_state: NULL d")); oldstate = d->state; if (newstate == oldstate) return (0); /* We allow to take down an open drive only with force. */ if ((newstate == GV_DRIVE_DOWN) && gv_consumer_is_open(d->consumer) && (!(flags & GV_SETSTATE_FORCE))) return (GV_ERR_ISBUSY); d->state = newstate; if (d->state != oldstate) { LIST_FOREACH(s, &d->subdisks, from_drive) gv_update_sd_state(s); } /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(d->vinumconf); return (0); } int gv_set_sd_state(struct gv_sd *s, int newstate, int flags) { struct gv_drive *d; struct gv_plex *p; int oldstate, status; KASSERT(s != NULL, ("gv_set_sd_state: NULL s")); oldstate = s->state; /* We are optimistic and assume it will work. */ status = 0; if (newstate == oldstate) return (0); switch (newstate) { case GV_SD_DOWN: /* * If we're attached to a plex, we won't go down without use of * force. */ if ((s->plex_sc != NULL) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISATTACHED); break; case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * Only do this if we're forced, since it usually is done * internally, and then we do use the force flag. */ if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); break; case GV_SD_UP: /* We can't bring the subdisk up if our drive is dead. */ d = s->drive_sc; if ((d == NULL) || (d->state != GV_DRIVE_UP)) return (GV_ERR_SETSTATE); /* Check from where we want to be brought up. */ switch (s->state) { case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * The subdisk was initializing. We allow it to be * brought up. */ break; case GV_SD_DOWN: /* * The subdisk is currently down. 
We allow it to be * brought up if it is not attached to a plex. */ p = s->plex_sc; if (p == NULL) break; /* * If this subdisk is attached to a plex, we allow it * to be brought up if the plex if it's not a RAID5 * plex, otherwise it's made 'stale'. */ if (p->org != GV_PLEX_RAID5) break; else if (s->flags & GV_SD_CANGOUP) { s->flags &= ~GV_SD_CANGOUP; break; } else if (flags & GV_SETSTATE_FORCE) break; else s->state = GV_SD_STALE; status = GV_ERR_SETSTATE; break; case GV_SD_STALE: /* * A stale subdisk can be brought up only if it's part * of a concat or striped plex that's the only one in a * volume, or if the subdisk isn't attached to a plex. * Otherwise it needs to be revived or initialized * first. */ p = s->plex_sc; if (p == NULL || flags & GV_SETSTATE_FORCE) break; if ((p->org != GV_PLEX_RAID5 && p->vol_sc->plexcount == 1) || (p->flags & GV_PLEX_SYNCING && p->synced > 0 && p->org == GV_PLEX_RAID5)) break; else return (GV_ERR_SETSTATE); default: return (GV_ERR_INVSTATE); } break; /* Other state transitions are only possible with force. */ default: if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); } /* We can change the state and do it. */ if (status == 0) s->state = newstate; /* Update our plex, if we're attached to one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(s->vinumconf); return (status); } int gv_set_plex_state(struct gv_plex *p, int newstate, int flags) { struct gv_volume *v; int oldstate, plexdown; KASSERT(p != NULL, ("gv_set_plex_state: NULL p")); oldstate = p->state; v = p->vol_sc; plexdown = 0; if (newstate == oldstate) return (0); switch (newstate) { case GV_PLEX_UP: /* Let update_plex handle if the plex can come up */ gv_update_plex_state(p); if (p->state != GV_PLEX_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); p->state = newstate; break; case GV_PLEX_DOWN: /* * Set state to GV_PLEX_DOWN only if no-one is using the plex, * or if the state is forced. */ if (v != NULL) { /* If the only one up, force is needed. */ plexdown = gv_plexdown(v); if ((v->plexcount == 1 || (v->plexcount - plexdown == 1)) && ((flags & GV_SETSTATE_FORCE) == 0)) return (GV_ERR_SETSTATE); } p->state = newstate; break; case GV_PLEX_DEGRADED: /* Only used internally, so we have to be forced. */ if (flags & GV_SETSTATE_FORCE) p->state = newstate; break; } /* Update our volume if we have one. */ if (v != NULL) gv_update_vol_state(v); /* Save config. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(p->vinumconf); return (0); } int gv_set_vol_state(struct gv_volume *v, int newstate, int flags) { int oldstate; KASSERT(v != NULL, ("gv_set_vol_state: NULL v")); oldstate = v->state; if (newstate == oldstate) return (0); switch (newstate) { case GV_VOL_UP: /* Let update handle if the volume can come up. */ gv_update_vol_state(v); if (v->state != GV_VOL_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); v->state = newstate; break; case GV_VOL_DOWN: /* * Set state to GV_VOL_DOWN only if no-one is using the volume, * or if the state should be forced. */ if (!gv_provider_is_open(v->provider) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISBUSY); v->state = newstate; break; } /* Save config */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(v->vinumconf); return (0); } /* Update the state of a subdisk based on its environment. 
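 * In short: a subdisk on a drive that is not up goes down; a freshly
 * created (GV_SD_NEWBORN) subdisk is assumed good and comes up;
 * anything else that is not already up turns stale unless
 * GV_SD_CANGOUP lets it come up.  A subdisk whose on-disk state was
 * anything but "up" therefore typically surfaces as GV_SD_STALE and
 * has to be revived or initialized before it carries data again.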
*/ void gv_update_sd_state(struct gv_sd *s) { struct gv_drive *d; int oldstate; KASSERT(s != NULL, ("gv_update_sd_state: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_update_sd_state: NULL d")); oldstate = s->state; /* If our drive isn't up we cannot be up either. */ if (d->state != GV_DRIVE_UP) { s->state = GV_SD_DOWN; /* If this subdisk was just created, we assume it is good.*/ } else if (s->flags & GV_SD_NEWBORN) { s->state = GV_SD_UP; s->flags &= ~GV_SD_NEWBORN; } else if (s->state != GV_SD_UP) { if (s->flags & GV_SD_CANGOUP) { s->state = GV_SD_UP; s->flags &= ~GV_SD_CANGOUP; } else s->state = GV_SD_STALE; } else s->state = GV_SD_UP; if (s->state != oldstate) G_VINUM_DEBUG(1, "subdisk %s state change: %s -> %s", s->name, gv_sdstate(oldstate), gv_sdstate(s->state)); /* Update the plex, if we have one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); } /* Update the state of a plex based on its environment. */ void gv_update_plex_state(struct gv_plex *p) { struct gv_sd *s; int sdstates; int oldstate; KASSERT(p != NULL, ("gv_update_plex_state: NULL p")); oldstate = p->state; /* First, check the state of our subdisks. */ sdstates = gv_sdstatemap(p); /* If all subdisks are up, our plex can be up, too. */ if (sdstates == GV_SD_UPSTATE) p->state = GV_PLEX_UP; /* One or more of our subdisks are down. */ else if (sdstates & GV_SD_DOWNSTATE) { /* A RAID5 plex can handle one dead subdisk. */ if ((p->org == GV_PLEX_RAID5) && (p->sddown == 1)) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; /* Some of our subdisks are initializing. */ } else if (sdstates & GV_SD_INITSTATE) { if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; } else p->state = GV_PLEX_DOWN; if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { p->state = GV_PLEX_GROWABLE; break; } } } if (p->state != oldstate) G_VINUM_DEBUG(1, "plex %s state change: %s -> %s", p->name, gv_plexstate(oldstate), gv_plexstate(p->state)); /* Update our volume, if we have one. */ if (p->vol_sc != NULL) gv_update_vol_state(p->vol_sc); } /* Update the volume state based on its plexes. */ void gv_update_vol_state(struct gv_volume *v) { struct gv_plex *p; KASSERT(v != NULL, ("gv_update_vol_state: NULL v")); /* The volume can't be up without plexes. */ if (v->plexcount == 0) { v->state = GV_VOL_DOWN; return; } LIST_FOREACH(p, &v->plexes, in_volume) { /* One of our plexes is accessible, and so are we. */ if (p->state > GV_PLEX_DEGRADED) { v->state = GV_VOL_UP; return; /* We can handle a RAID5 plex with one dead subdisk as well. */ } else if ((p->org == GV_PLEX_RAID5) && (p->state == GV_PLEX_DEGRADED)) { v->state = GV_VOL_UP; return; } } /* Not one of our plexes is up, so we can't be either. */ v->state = GV_VOL_DOWN; } /* Return a state map for the subdisks of a plex. */ int gv_sdstatemap(struct gv_plex *p) { struct gv_sd *s; int statemap; KASSERT(p != NULL, ("gv_sdstatemap: NULL p")); statemap = 0; p->sddown = 0; /* No subdisks down yet. */ LIST_FOREACH(s, &p->subdisks, in_plex) { switch (s->state) { case GV_SD_DOWN: case GV_SD_STALE: statemap |= GV_SD_DOWNSTATE; p->sddown++; /* Another unusable subdisk. */ break; case GV_SD_UP: statemap |= GV_SD_UPSTATE; break; case GV_SD_INITIALIZING: statemap |= GV_SD_INITSTATE; break; case GV_SD_REVIVING: statemap |= GV_SD_INITSTATE; p->sddown++; /* XXX: Another unusable subdisk? 
*/ break; } } return (statemap); } Index: head/sys/geom/vinum/geom_vinum_subr.c =================================================================== --- head/sys/geom/vinum/geom_vinum_subr.c (revision 350693) +++ head/sys/geom/vinum/geom_vinum_subr.c (revision 350694) @@ -1,1283 +1,1284 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include #include int gv_drive_is_newer(struct gv_softc *, struct gv_drive *); static off_t gv_plex_smallest_sd(struct gv_plex *); void gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d) { char *aptr, *bptr, *cptr; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; int error, is_newer, tokens; char *token[GV_MAXARGS]; is_newer = gv_drive_is_newer(sc, d); /* Until the end of the string *buf. */ for (aptr = buf; *aptr != '\0'; aptr = bptr) { bptr = aptr; cptr = aptr; /* Separate input lines. 
*/ while (*bptr != '\n') bptr++; *bptr = '\0'; bptr++; tokens = gv_tokenize(cptr, token, GV_MAXARGS); if (tokens <= 0) continue; if (!strcmp(token[0], "volume")) { v = gv_new_volume(tokens, token); if (v == NULL) { G_VINUM_DEBUG(0, "config parse failed volume"); break; } v2 = gv_find_vol(sc, v->name); if (v2 != NULL) { if (is_newer) { v2->state = v->state; G_VINUM_DEBUG(2, "newer volume found!"); } g_free(v); continue; } gv_create_volume(sc, v); } else if (!strcmp(token[0], "plex")) { p = gv_new_plex(tokens, token); if (p == NULL) { G_VINUM_DEBUG(0, "config parse failed plex"); break; } p2 = gv_find_plex(sc, p->name); if (p2 != NULL) { /* XXX */ if (is_newer) { p2->state = p->state; G_VINUM_DEBUG(2, "newer plex found!"); } g_free(p); continue; } error = gv_create_plex(sc, p); if (error) continue; /* * These flags were set in gv_create_plex() and are not * needed here (on-disk config parsing). */ p->flags &= ~GV_PLEX_ADDED; } else if (!strcmp(token[0], "sd")) { s = gv_new_sd(tokens, token); if (s == NULL) { G_VINUM_DEBUG(0, "config parse failed subdisk"); break; } s2 = gv_find_sd(sc, s->name); if (s2 != NULL) { /* XXX */ if (is_newer) { s2->state = s->state; G_VINUM_DEBUG(2, "newer subdisk found!"); } g_free(s); continue; } /* * Signal that this subdisk was tasted, and could * possibly reference a drive that isn't in our config * yet. */ s->flags |= GV_SD_TASTED; if (s->state == GV_SD_UP) s->flags |= GV_SD_CANGOUP; error = gv_create_sd(sc, s); if (error) continue; /* * This flag was set in gv_create_sd() and is not * needed here (on-disk config parsing). */ s->flags &= ~GV_SD_NEWBORN; s->flags &= ~GV_SD_GROW; } } } /* * Format the vinum configuration properly. If ondisk is non-zero then the * configuration is intended to be written to disk later. */ void gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix) { struct gv_drive *d; struct gv_sd *s; struct gv_plex *p; struct gv_volume *v; /* * We don't need the drive configuration if we're not writing the * config to disk. */ if (!ondisk) { LIST_FOREACH(d, &sc->drives, drive) { sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix, d->name, d->device); } } LIST_FOREACH(v, &sc->volumes, volume) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "volume %s", v->name); if (ondisk) sbuf_printf(sb, " state %s", gv_volstate(v->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(p, &sc->plexes, plex) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "plex name %s org %s ", p->name, gv_plexorg(p->org)); if (gv_is_striped(p)) sbuf_printf(sb, "%ds ", p->stripesize / 512); if (p->vol_sc != NULL) sbuf_printf(sb, "vol %s", p->volume); if (ondisk) sbuf_printf(sb, " state %s", gv_plexstate(p->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset " "%jds", s->name, s->drive, s->size / 512, s->drive_offset / 512); if (s->plex_sc != NULL) { sbuf_printf(sb, " plex %s plexoffset %jds", s->plex, s->plex_offset / 512); } if (ondisk) sbuf_printf(sb, " state %s", gv_sdstate(s->state)); sbuf_printf(sb, "\n"); } } static off_t gv_plex_smallest_sd(struct gv_plex *p) { struct gv_sd *s; off_t smallest; KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p")); s = LIST_FIRST(&p->subdisks); if (s == NULL) return (-1); smallest = s->size; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->size < smallest) smallest = s->size; } return (smallest); } /* Walk over plexes in a volume and count how many are down. 
*/ int gv_plexdown(struct gv_volume *v) { int plexdown; struct gv_plex *p; KASSERT(v != NULL, ("gv_plexdown: NULL v")); plexdown = 0; LIST_FOREACH(p, &v->plexes, plex) { if (p->state == GV_PLEX_DOWN) plexdown++; } return (plexdown); } int gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p) { struct gv_sd *s2; off_t psizeorig, remainder, smallest; /* If this subdisk was already given to this plex, do nothing. */ if (s->plex_sc == p) return (0); /* Check correct size of this subdisk. */ s2 = LIST_FIRST(&p->subdisks); /* Adjust the subdisk-size if necessary. */ if (s2 != NULL && gv_is_striped(p)) { /* First adjust to the stripesize. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } smallest = gv_plex_smallest_sd(p); /* Then take off extra if other subdisks are smaller. */ remainder = s->size - smallest; /* * Don't allow a remainder below zero for running plexes, it's too * painful, and if someone were to accidentally do this, the * resulting array might be smaller than the original... not god */ if (remainder < 0) { if (!(p->flags & GV_PLEX_NEWBORN)) { G_VINUM_DEBUG(0, "sd %s too small for plex %s!", s->name, p->name); return (GV_ERR_BADSIZE); } /* Adjust other subdisks. */ LIST_FOREACH(s2, &p->subdisks, in_plex) { G_VINUM_DEBUG(1, "size of sd %s is to big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s2, (remainder * -1)); } } else if (remainder > 0) { G_VINUM_DEBUG(1, "size of sd %s is to big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } /* Find the correct plex offset for this subdisk, if needed. */ if (s->plex_offset == -1) { /* * First set it to 0 to catch the case where we had a detached * subdisk that didn't get any good offset. */ s->plex_offset = 0; if (p->sdcount) { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (gv_is_striped(p)) s->plex_offset = p->sdcount * p->stripesize; else s->plex_offset = s2->plex_offset + s2->size; } } } /* There are no subdisks for this plex yet, just insert it. */ if (LIST_EMPTY(&p->subdisks)) { LIST_INSERT_HEAD(&p->subdisks, s, in_plex); /* Insert in correct order, depending on plex_offset. */ } else { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->plex_offset < s2->plex_offset) { LIST_INSERT_BEFORE(s2, s, in_plex); break; } else if (LIST_NEXT(s2, in_plex) == NULL) { LIST_INSERT_AFTER(s2, s, in_plex); break; } } } s->plex_sc = p; /* Adjust the size of our plex. We check if the plex misses a subdisk, * so we don't make the plex smaller than it actually should be. */ psizeorig = p->size; p->size = gv_plex_size(p); /* Make sure the size is not changed. */ if (p->sddetached > 0) { if (p->size < psizeorig) { p->size = psizeorig; /* We make sure wee need another subdisk. */ if (p->sddetached == 1) p->sddetached++; } p->sddetached--; } else { if ((p->org == GV_PLEX_RAID5 || p->org == GV_PLEX_STRIPED) && !(p->flags & GV_PLEX_NEWBORN) && p->state == GV_PLEX_UP) { s->flags |= GV_SD_GROW; } p->sdcount++; } return (0); } void gv_update_vol_size(struct gv_volume *v, off_t size) { if (v == NULL) return; if (v->provider != NULL) { g_topology_lock(); v->provider->mediasize = size; g_topology_unlock(); } v->size = size; } /* Return how many subdisks that constitute the original plex. 
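 * With a non-zero 'growing' argument, subdisks flagged GV_SD_GROW are
 * left out of the count.  As an illustration: a striped plex being
 * grown from two subdisks to three still reports two here until the
 * grow finishes, so the size calculations below keep using the
 * original stripe geometry.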
*/ int gv_sdcount(struct gv_plex *p, int growing) { struct gv_sd *s; int sdcount; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } return (sdcount); } /* Calculates the plex size. */ off_t gv_plex_size(struct gv_plex *p) { struct gv_sd *s; off_t size; int sdcount; KASSERT(p != NULL, ("gv_plex_size: NULL p")); /* Adjust the size of our plex. */ size = 0; sdcount = gv_sdcount(p, 1); switch (p->org) { case GV_PLEX_CONCAT: LIST_FOREACH(s, &p->subdisks, in_plex) size += s->size; break; case GV_PLEX_STRIPED: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? (sdcount * s->size) : 0); break; case GV_PLEX_RAID5: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? ((sdcount - 1) * s->size) : 0); break; } return (size); } /* Returns the size of a volume. */ off_t gv_vol_size(struct gv_volume *v) { struct gv_plex *p; off_t minplexsize; KASSERT(v != NULL, ("gv_vol_size: NULL v")); p = LIST_FIRST(&v->plexes); if (p == NULL) return (0); minplexsize = p->size; LIST_FOREACH(p, &v->plexes, in_volume) { if (p->size < minplexsize) { minplexsize = p->size; } } return (minplexsize); } void gv_update_plex_config(struct gv_plex *p) { struct gv_sd *s, *s2; off_t remainder; int required_sds, state; KASSERT(p != NULL, ("gv_update_plex_config: NULL p")); /* The plex was added to an already running volume. */ if (p->flags & GV_PLEX_ADDED) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); switch (p->org) { case GV_PLEX_STRIPED: required_sds = 2; break; case GV_PLEX_RAID5: required_sds = 3; break; case GV_PLEX_CONCAT: default: required_sds = 0; break; } if (required_sds) { if (p->sdcount < required_sds) { gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } /* * The subdisks in striped plexes must all have the same size. */ s = LIST_FIRST(&p->subdisks); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->size != s2->size) { G_VINUM_DEBUG(0, "subdisk size mismatch %s" "(%jd) <> %s (%jd)", s->name, s->size, s2->name, s2->size); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } } LIST_FOREACH(s, &p->subdisks, in_plex) { /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } } p->size = gv_plex_size(p); if (p->sdcount == 0) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); else if (p->org == GV_PLEX_RAID5 && p->flags & GV_PLEX_NEWBORN) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE); /* If added to a volume, we want the plex to be down. */ state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP; gv_set_plex_state(p, state, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->flags & GV_PLEX_ADDED) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { gv_set_plex_state(p, GV_PLEX_GROWABLE, GV_SETSTATE_FORCE); break; } } } /* Our plex is grown up now. */ p->flags &= ~GV_PLEX_NEWBORN; } /* * Give a subdisk to a drive, check and adjust several parameters, adjust * freelist. 
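 * Worked example with made-up numbers: given one free slot starting
 * at offset 1m with a size of 10m, placing a 4m subdisk at drive
 * offset 3m shrinks that slot to [1m, +2m) and inserts a new slot
 * [7m, +4m) behind the subdisk; placing the same subdisk at offset 1m
 * instead simply advances the slot to [5m, +6m).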
*/ int gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d) { struct gv_sd *s2; struct gv_freelist *fl, *fl2; off_t tmp; int i; fl2 = NULL; /* Shortcut for "referenced" drives. */ if (d->flags & GV_DRIVE_REFERENCED) { s->drive_sc = d; return (0); } /* Check if this subdisk was already given to this drive. */ if (s->drive_sc != NULL) { if (s->drive_sc == d) { if (!(s->flags & GV_SD_TASTED)) { return (0); } } else { G_VINUM_DEBUG(0, "error giving subdisk '%s' to '%s' " "(already on '%s')", s->name, d->name, s->drive_sc->name); return (GV_ERR_ISATTACHED); } } /* Preliminary checks. */ if ((s->size > d->avail) || (d->freelist_entries == 0)) { G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name, s->name); return (GV_ERR_NOSPACE); } /* If no size was given for this subdisk, try to auto-size it... */ if (s->size == -1) { /* Find the largest available slot. */ LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; s->size = fl->size; s->drive_offset = fl->offset; fl2 = fl; } /* No good slot found? */ if (s->size == -1) { G_VINUM_DEBUG(0, "unable to autosize '%s' on '%s'", s->name, d->name); return (GV_ERR_BADSIZE); } /* * ... or check if we have a free slot that's large enough for the * given size. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; /* Assign drive offset, if not given. */ if (s->drive_offset == -1) s->drive_offset = fl->offset; fl2 = fl; i++; break; } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "free slots to small for '%s' on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* No drive offset given, try to calculate it. */ if (s->drive_offset == -1) { /* Add offsets and sizes from other subdisks on this drive. */ LIST_FOREACH(s2, &d->subdisks, from_drive) { s->drive_offset = s2->drive_offset + s2->size; } /* * If there are no other subdisks yet, then set the default * offset to GV_DATA_START. */ if (s->drive_offset == -1) s->drive_offset = GV_DATA_START; /* Check if we have a free slot at the given drive offset. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { /* Yes, this subdisk fits. */ if ((fl->offset <= s->drive_offset) && (fl->offset + fl->size >= s->drive_offset + s->size)) { i++; fl2 = fl; break; } } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit " "on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* * Now that all parameters are checked and set up, we can give the * subdisk to the drive and adjust the freelist. */ /* First, adjust the freelist. */ LIST_FOREACH(fl, &d->freelist, freelist) { /* Look for the free slot that we have found before. */ if (fl != fl2) continue; /* The subdisk starts at the beginning of the free slot. */ if (fl->offset == s->drive_offset) { fl->offset += s->size; fl->size -= s->size; /* The subdisk uses the whole slot, so remove it. */ if (fl->size == 0) { d->freelist_entries--; LIST_REMOVE(fl, freelist); } /* * The subdisk does not start at the beginning of the free * slot. */ } else { tmp = fl->offset + fl->size; fl->size = s->drive_offset - fl->offset; /* * The subdisk didn't use the complete rest of the free * slot, so we need to split it. */ if (s->drive_offset + s->size != tmp) { fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO); fl2->offset = s->drive_offset + s->size; fl2->size = tmp - fl2->offset; LIST_INSERT_AFTER(fl, fl2, freelist); d->freelist_entries++; } } break; } /* * This is the first subdisk on this drive, just insert it into the * list. 
*/ if (LIST_EMPTY(&d->subdisks)) { LIST_INSERT_HEAD(&d->subdisks, s, from_drive); /* There are other subdisks, so insert this one in correct order. */ } else { LIST_FOREACH(s2, &d->subdisks, from_drive) { if (s->drive_offset < s2->drive_offset) { LIST_INSERT_BEFORE(s2, s, from_drive); break; } else if (LIST_NEXT(s2, from_drive) == NULL) { LIST_INSERT_AFTER(s2, s, from_drive); break; } } } d->sdcount++; d->avail -= s->size; s->flags &= ~GV_SD_TASTED; /* Link back from the subdisk to this drive. */ s->drive_sc = d; return (0); } void gv_free_sd(struct gv_sd *s) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_free_sd: NULL s")); d = s->drive_sc; if (d == NULL) return; /* * First, find the free slot that's immediately before or after this * subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; if (fl->offset + fl->size == s->drive_offset) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = s->size; fl->offset = s->drive_offset; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->size += s->size; if (fl->offset > s->drive_offset) fl->offset = s->drive_offset; } d->avail += s->size; d->sdcount--; } void gv_adjust_freespace(struct gv_sd *s, off_t remainder) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_adjust_freespace: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_adjust_freespace: NULL d")); /* First, find the free slot that's immediately after this subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; } /* If there is no free slot behind this subdisk, so create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = remainder; fl->offset = s->drive_offset + s->size - remainder; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->offset -= remainder; fl->size += remainder; } s->size -= remainder; d->avail += remainder; } /* Check if the given plex is a striped one. */ int gv_is_striped(struct gv_plex *p) { KASSERT(p != NULL, ("gv_is_striped: NULL p")); switch(p->org) { case GV_PLEX_STRIPED: case GV_PLEX_RAID5: return (1); default: return (0); } } /* Find a volume by name. */ struct gv_volume * gv_find_vol(struct gv_softc *sc, char *name) { struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (v); } return (NULL); } /* Find a plex by name. */ struct gv_plex * gv_find_plex(struct gv_softc *sc, char *name) { struct gv_plex *p; LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (p); } return (NULL); } /* Find a subdisk by name. 
*/ struct gv_sd * gv_find_sd(struct gv_softc *sc, char *name) { struct gv_sd *s; LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (s); } return (NULL); } /* Find a drive by name. */ struct gv_drive * gv_find_drive(struct gv_softc *sc, char *name) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (d); } return (NULL); } /* Find a drive given a device. */ struct gv_drive * gv_find_drive_device(struct gv_softc *sc, char *device) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if(!strcmp(d->device, device)) return (d); } return (NULL); } /* Check if any consumer of the given geom is open. */ int gv_consumer_is_open(struct g_consumer *cp) { if (cp == NULL) return (0); if (cp->acr || cp->acw || cp->ace) return (1); return (0); } int gv_provider_is_open(struct g_provider *pp) { if (pp == NULL) return (0); if (pp->acr || pp->acw || pp->ace) return (1); return (0); } /* * Compare the modification dates of the drives. * Return 1 if a > b, 0 otherwise. */ int gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d) { struct gv_drive *d2; struct timeval *a, *b; KASSERT(!LIST_EMPTY(&sc->drives), ("gv_is_drive_newer: empty drive list")); a = &d->hdr->label.last_update; LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->state != GV_DRIVE_UP) || (d2->hdr == NULL)) continue; b = &d2->hdr->label.last_update; if (timevalcmp(a, b, >)) return (1); } return (0); } /* Return the type of object identified by string 'name'. */ int gv_object_type(struct gv_softc *sc, char *name) { struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (GV_TYPE_VOL); } LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (GV_TYPE_PLEX); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (GV_TYPE_SD); } LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (GV_TYPE_DRIVE); } return (GV_ERR_NOTFOUND); } void gv_setup_objects(struct gv_softc *sc) { struct g_provider *pp; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; LIST_FOREACH(s, &sc->subdisks, sd) { d = gv_find_drive(sc, s->drive); if (d != NULL) gv_sd_to_drive(s, d); p = gv_find_plex(sc, s->plex); if (p != NULL) gv_sd_to_plex(s, p); gv_update_sd_state(s); } LIST_FOREACH(p, &sc->plexes, plex) { gv_update_plex_config(p); v = gv_find_vol(sc, p->volume); if (v != NULL && p->vol_sc != v) { p->vol_sc = v; v->plexcount++; LIST_INSERT_HEAD(&v->plexes, p, in_volume); } gv_update_plex_config(p); } LIST_FOREACH(v, &sc->volumes, volume) { v->size = gv_vol_size(v); if (v->provider == NULL) { g_topology_lock(); pp = g_new_providerf(sc->geom, "gvinum/%s", v->name); pp->mediasize = v->size; pp->sectorsize = 512; /* XXX */ g_error_provider(pp, 0); v->provider = pp; pp->private = v; g_topology_unlock(); } else if (v->provider->mediasize != v->size) { g_topology_lock(); v->provider->mediasize = v->size; g_topology_unlock(); } v->flags &= ~GV_VOL_NEWBORN; gv_update_vol_state(v); } } void gv_cleanup(struct gv_softc *sc) { struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_drive *d, *d2; struct gv_freelist *fl, *fl2; mtx_lock(&sc->config_mtx); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { LIST_REMOVE(v, volume); g_free(v->wqueue); g_free(v); } LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { 
LIST_REMOVE(p, plex); g_free(p->bqueue); g_free(p->rqueue); g_free(p->wqueue); g_free(p); } LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { LIST_REMOVE(s, sd); g_free(s); } LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); g_free(d); } mtx_destroy(&sc->config_mtx); } /* General 'attach' routine. */ int gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename) { struct gv_sd *s; struct gv_softc *sc; g_topology_assert(); sc = p->vinumconf; KASSERT(sc != NULL, ("NULL sc")); if (p->vol_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", p->name, p->volume); return (GV_ERR_ISATTACHED); } /* Stale all subdisks of this plex. */ LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_STALE) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); } /* Attach to volume. Make sure volume is not up and running. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy", p->name, v->name); return (GV_ERR_ISBUSY); } p->vol_sc = v; strlcpy(p->volume, v->name, sizeof(p->volume)); v->plexcount++; if (rename) { snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); } LIST_INSERT_HEAD(&v->plexes, p, in_volume); /* Get plex up again. */ gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); gv_save_config(p->vinumconf); return (0); } int gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) { struct gv_sd *s2; int error, sdcount; g_topology_assert(); /* If subdisk is attached, don't do it. */ if (s->plex_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", s->name, s->plex); return (GV_ERR_ISATTACHED); } gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); /* First check that this subdisk has a correct offset. If none other * starts at the same, and it's correct module stripesize, it is */ if (offset != -1 && offset % p->stripesize != 0) return (GV_ERR_BADOFFSET); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s2->plex_offset == offset) return (GV_ERR_BADOFFSET); } /* Attach the subdisk to the plex at given offset. */ s->plex_offset = offset; strlcpy(s->plex, p->name, sizeof(s->plex)); sdcount = p->sdcount; error = gv_sd_to_plex(s, p); if (error) return (error); gv_update_plex_config(p); if (rename) { snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex, p->sdcount); } if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); /* We don't update the subdisk state since the user might have to * initiate a rebuild/sync first. */ return (0); } /* Detach a plex from a volume. */ int gv_detach_plex(struct gv_plex *p, int flags) { struct gv_volume *v; g_topology_assert(); v = p->vol_sc; if (v == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", p->name); return (0); /* Not an error. */ } /* * Only proceed if forced or volume inactive. */ if (!(flags & GV_FLAG_F) && (gv_provider_is_open(v->provider) || p->state == GV_PLEX_UP)) { G_VINUM_DEBUG(1, "unable to detach %s: volume %s is busy", p->name, p->volume); return (GV_ERR_ISBUSY); } v->plexcount--; /* Make sure someone don't read us when gone. */ v->last_read_plex = NULL; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; memset(p->volume, 0, GV_MAXVOLNAME); gv_update_vol_size(v, gv_vol_size(v)); gv_save_config(p->vinumconf); return (0); } /* Detach a subdisk from a plex. 
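 * Unless GV_FLAG_F is given, detaching is refused while the plex is
 * still serving data (up, or degraded with this subdisk up).  From
 * the usual gvinum(8) front end a forced detach would look roughly
 * like
 *     gvinum detach -f myvol.p0.s0
 * where the subdisk name is purely illustrative.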
*/ int gv_detach_sd(struct gv_sd *s, int flags) { struct gv_plex *p; g_topology_assert(); p = s->plex_sc; if (p == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", s->name); return (0); /* Not an error. */ } /* * Don't proceed if we're not forcing, and the plex is up, or degraded * with this subdisk up. */ if (!(flags & GV_FLAG_F) && ((p->state > GV_PLEX_DEGRADED) || ((p->state == GV_PLEX_DEGRADED) && (s->state == GV_SD_UP)))) { G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy", s->name, s->plex); return (GV_ERR_ISBUSY); } LIST_REMOVE(s, in_plex); s->plex_sc = NULL; memset(s->plex, 0, GV_MAXPLEXNAME); p->sddetached++; gv_save_config(s->vinumconf); return (0); } Index: head/sys/geom/virstor/g_virstor.c =================================================================== --- head/sys/geom/virstor/g_virstor.c (revision 350693) +++ head/sys/geom/virstor/g_virstor.c (revision 350694) @@ -1,1894 +1,1895 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e. a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. 
*/ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into MAXPHYS exactly, with no wasted space. 
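 * As a rough illustration: with the common 128 kB MAXPHYS and an
 * assumed 16-byte map entry, VIRSTOR_MAP_BLOCK_ENTRIES would have to
 * be 8192 for the assertion below to hold; any other combination
 * breaks the build instead of silently wasting map space.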
*/ CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; sprintf(param, "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. 
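 * From userland this path is normally reached with something like
 *     gvirstor add myvirstor /dev/ada2
 * (names are illustrative).  The code below prepares each new
 * component completely and only then bumps n_components, so the I/O
 * path never observes a half-initialised entry.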
*/ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; const char *prov_name; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); prov_name = gctl_get_asciiparam(req, aname); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", aname); g_topology_unlock(); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(prov_name); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider: '%s' (added" " %u components)", prov_name, added); update_metadata(sc); } else { gctl_error(req, "Invalid provider: '%s'", prov_name); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ 
sc->n_components++; added++; } /* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. */ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. 
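 * The matching userland request would be along the lines of
 *     gvirstor remove myvirstor /dev/ada2
 * (again illustrative names).  Only components without allocated
 * chunks may be taken out, and the component array is replaced with a
 * single pointer assignment so concurrent allocations never see a
 * torn table.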
*/ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; sprintf(param, "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, found * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this non-fatal error */ continue; } /* Renumerate unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { sc->components[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcomp member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; } /* This call to update_metadata() is critical. 
In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ g_topology_lock(); update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed, sc->geom->name); } /* * Clear metadata sector on component */ static int clear_metadata(struct g_virstor_component *comp) { char *buf; int error; LOG_MSG(LVL_INFO, "Clearing metadata on %s", comp->gcons->provider->name); g_topology_assert(); error = g_access(comp->gcons, 0, 1, 0); if (error != 0) return (error); buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR, M_WAITOK | M_ZERO); error = g_write_data(comp->gcons, comp->gcons->provider->mediasize - comp->gcons->provider->sectorsize, buf, comp->gcons->provider->sectorsize); free(buf, M_GVIRSTOR); g_access(comp->gcons, 0, -1, 0); return (error); } /* * Destroy geom forcibly. */ static int g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp) { struct g_virstor_softc *sc; int exitval; sc = gp->softc; KASSERT(sc != NULL, ("%s: NULL sc", __func__)); exitval = 0; LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name, gp->softc); if (sc != NULL) { #ifdef INVARIANTS char *buf; int error; off_t off; int isclean, count; int n; LOG_MSG(LVL_INFO, "INVARIANTS detected"); LOG_MSG(LVL_INFO, "Verifying allocation " "table for %s", sc->geom->name); count = 0; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].flags || VIRSTOR_MAP_ALLOCATED != 0) count++; } LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks", sc->geom->name, count); n = off = count = 0; isclean = 1; if (virstor_valid_components(sc) != sc->n_components) { /* This is a incomplete virstor device (not all * components have been found) */ LOG_MSG(LVL_ERROR, "Device %s is incomplete", sc->geom->name); goto bailout; } error = g_access(sc->components[0].gcons, 1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d)", __func__, error)); /* Compare the whole on-disk allocation table with what's * currently in memory */ while (n < sc->chunk_count) { buf = g_read_data(sc->components[0].gcons, off, sc->sectorsize, &error); KASSERT(buf != NULL, ("g_read_data returned NULL (%d) " "for read at %jd", error, off)); if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) { LOG_MSG(LVL_ERROR, "ERROR in allocation table, " "entry %d, offset %jd", n, off); isclean = 0; count++; } n += sc->me_per_sector; off += sc->sectorsize; g_free(buf); } error = g_access(sc->components[0].gcons, -1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d) on exit", __func__, error)); if (isclean != 1) { LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s " "(%d sectors don't match, max %zu allocations)", sc->geom->name, count, count * sc->me_per_sector); } else { LOG_MSG(LVL_INFO, "Allocation table ok for %s", sc->geom->name); } bailout: #endif update_metadata(sc); virstor_geom_destroy(sc, FALSE, FALSE); exitval = EAGAIN; } else exitval = 0; return (exitval); } /* * Taste event (per-class callback) * Examines a provider and creates geom instances if needed */ static struct g_geom * g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_virstor_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_virstor_softc *sc; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name); /* We need a dummy geom to attach a consumer to the given 
provider */ gp = g_new_geomf(mp, "virstor:taste.helper"); gp->start = (void *)invalid_call; /* XXX: hacked up so the */ gp->access = (void *)invalid_call; /* compiler doesn't complain. */ gp->orphan = (void *)invalid_call; /* I really want these to fail. */ cp = g_new_consumer(gp); g_attach(cp, pp); error = read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0) return (NULL); if (md.md_version != G_VIRSTOR_VERSION) { LOG_MSG(LVL_ERROR, "Kernel module version invalid " "to handle %s (%s) : %d should be %d", md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION); return (NULL); } if (md.provsize != pp->mediasize) return (NULL); /* If the provider name is hardcoded, use the offered provider only * if it's been offered with its proper name (the one used in * the label command). */ if (md.provider[0] != '\0' && !g_compare_names(md.provider, pp->name)) return (NULL); /* Iterate all geoms this class already knows about to see if a new * geom instance of this class needs to be created (in case the provider * is first from a (possibly) multi-consumer geom) or it just needs * to be added to an existing instance. */ sc = NULL; gp = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->geom->name) != 0) continue; if (md.md_id != sc->id) continue; break; } if (gp != NULL) { /* We found an existing geom instance; add to it */ LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); return (NULL); } } else { /* New geom instance needs to be created */ gp = create_virstor_geom(mp, &md); if (gp == NULL) { LOG_MSG(LVL_ERROR, "Error creating new instance of " "class %s: %s", mp->name, md.md_name); LOG_MSG(LVL_DEBUG, "Error creating %s at %s", md.md_name, pp->name); return (NULL); } sc = gp->softc; LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); virstor_geom_destroy(sc, TRUE, FALSE); return (NULL); } } return (gp); } /* * Destroyes consumer passed to it in arguments. Used as a callback * on g_event queue. 
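The taste handler above only accepts a provider whose metadata passes a series of gates: magic string, on-disk version, recorded provider size, and, when the label hard-codes a provider name, that name as well. A userland sketch of the same gate; the struct layout, the MY_MAGIC/MY_VERSION values and the use of plain strcmp() instead of g_compare_names() are simplifications assumed for illustration only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative constants; the real ones live with the metadata code. */
#define MY_MAGIC	"GEOM::VIRSTOR"
#define MY_VERSION	1

/* Simplified subset of struct g_virstor_metadata. */
struct md {
	char     md_magic[16];
	uint32_t md_version;
	uint64_t provsize;
	char     provider[64];	/* hardcoded provider name, may be empty */
};

/*
 * Mirror of the accept/reject decisions in g_virstor_taste(): wrong
 * magic, wrong version, a size mismatch, or a name mismatch against a
 * hardcoded provider name all cause the provider to be ignored.
 */
static bool
taste_ok(const struct md *md, const char *pp_name, uint64_t pp_mediasize)
{
	if (strcmp(md->md_magic, MY_MAGIC) != 0)
		return (false);
	if (md->md_version != MY_VERSION)
		return (false);
	if (md->provsize != pp_mediasize)
		return (false);
	if (md->provider[0] != '\0' && strcmp(md->provider, pp_name) != 0)
		return (false);
	return (true);
}

int
main(void)
{
	struct md m = {
		.md_magic = "GEOM::VIRSTOR",
		.md_version = MY_VERSION,
		.provsize = 1000000,
		.provider = "",
	};

	printf("accepted: %d\n", taste_ok(&m, "ada0p3", 1000000));
	return (0);
}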
*/ static void delay_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *c = arg; KASSERT(c != NULL, ("%s: invalid consumer", __func__)); LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay", c->provider->name); g_detach(c); g_destroy_consumer(c); } /* * Remove a component (consumer) from geom instance; If it's the first * component being removed, orphan the provider to announce geom's being * dismantled */ static void remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp, boolean_t delay) { struct g_consumer *c; KASSERT(comp->gcons != NULL, ("Component with no consumer in %s", sc->geom->name)); c = comp->gcons; comp->gcons = NULL; KASSERT(c->provider != NULL, ("%s: no provider", __func__)); LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name, sc->geom->name); if (sc->provider != NULL) { LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name); g_wither_provider(sc->provider, ENXIO); sc->provider = NULL; } if (c->acr > 0 || c->acw > 0 || c->ace > 0) g_access(c, -c->acr, -c->acw, -c->ace); if (delay) { /* Destroy consumer after it's tasted */ g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL); } else { g_detach(c); g_destroy_consumer(c); } } /* * Destroy geom - called internally * See g_virstor_destroy_geom for the other one */ static int virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, boolean_t delay) { struct g_provider *pp; struct g_geom *gp; u_int n; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { LOG_MSG(force ? LVL_WARNING : LVL_ERROR, "Device %s is still open.", pp->name); if (!force) return (EBUSY); } for (n = 0; n < sc->n_components; n++) { if (sc->components[n].gcons != NULL) remove_component(sc, &sc->components[n], delay); } gp = sc->geom; gp->softc = NULL; KASSERT(sc->provider == NULL, ("Provider still exists for %s", gp->name)); /* XXX: This might or might not work, since we're called with * the topology lock held. Also, it might panic the kernel if * the error'd BIO is in softupdates code. */ mtx_lock(&sc->delayed_bio_q_mtx); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { struct g_virstor_bio_q *bq; bq = STAILQ_FIRST(&sc->delayed_bio_q); bq->bio->bio_error = ENOSPC; g_io_deliver(bq->bio, EIO); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } mtx_unlock(&sc->delayed_bio_q_mtx); mtx_destroy(&sc->delayed_bio_q_mtx); free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); bzero(sc, sizeof *sc); free(sc, M_GVIRSTOR); pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name); g_wither_geom(gp, ENXIO); return (0); } /* * Utility function: read metadata & decode. Wants topology lock to be * held. */ static int read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); virstor_metadata_decode(buf, md); g_free(buf); return (0); } /** * Utility function: encode & write metadata. Assumes topology lock is * held. * * There is no useful way of recovering from errors in this function, * not involving panicking the kernel. 
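read_metadata() above always pulls exactly one sector from the very end of the underlying provider (mediasize - sectorsize), which is where virstor keeps its label, dropping the topology lock around the actual I/O. A userland sketch that reads the same location from a disk device with pread(2); it uses the standard FreeBSD DIOCGMEDIASIZE/DIOCGSECTORSIZE ioctls from disk(4) and decodes nothing, so it is only an illustration of where the label lives:

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	off_t mediasize;
	u_int sectorsize;
	char *buf;
	int fd;

	if (argc != 2)
		errx(1, "usage: %s /dev/<provider>", argv[0]);
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		err(1, "open");
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) != 0 ||
	    ioctl(fd, DIOCGSECTORSIZE, &sectorsize) != 0)
		err(1, "ioctl");
	buf = malloc(sectorsize);
	if (buf == NULL)
		err(1, "malloc");
	/* The virstor label occupies the provider's last sector. */
	if (pread(fd, buf, sectorsize, mediasize - sectorsize) !=
	    (ssize_t)sectorsize)
		err(1, "pread");
	printf("read %u bytes of metadata at offset %jd\n",
	    sectorsize, (intmax_t)(mediasize - sectorsize));
	free(buf);
	close(fd);
	return (0);
}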
If the metadata cannot be written * the most we can do is notify the operator and hope he spots it and * replaces the broken drive. */ static void write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; KASSERT(cp != NULL && md != NULL && cp->provider != NULL, ("Something's fishy in %s", __func__)); LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name); g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d", cp->provider->name, error); return; } pp = cp->provider; buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK); bzero(buf, pp->sectorsize); virstor_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GVIRSTOR); if (error != 0) LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s", error, cp->provider->name); } /* * Creates a new instance of this GEOM class, initialise softc */ static struct g_geom * create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md) { struct g_geom *gp; struct g_virstor_softc *sc; LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)", md->md_name, md->md_id); if (md->md_count < 1 || md->md_chunk_size < 1 || md->md_virsize < md->md_chunk_size) { /* This is bogus configuration, and probably means data is * somehow corrupted. Panic, maybe? */ LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s", md->md_name); return (NULL); } /* Check if it's already created */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) { LOG_MSG(LVL_WARNING, "Geom %s already exists", md->md_name); if (sc->id != md->md_id) { LOG_MSG(LVL_ERROR, "Some stale or invalid components " "exist for virstor device named %s. " "You will need to all stale " "components and maybe reconfigure " "the virstor device. 
Tune " "kern.geom.virstor.debug sysctl up " "for more information.", sc->geom->name); } return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); gp->softc = NULL; /* to circumevent races that test softc */ gp->start = g_virstor_start; gp->spoiled = g_virstor_orphan; gp->orphan = g_virstor_orphan; gp->access = g_virstor_access; gp->dumpconf = g_virstor_dumpconf; sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO); sc->id = md->md_id; sc->n_components = md->md_count; sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO); sc->chunk_size = md->md_chunk_size; sc->virsize = md->md_virsize; STAILQ_INIT(&sc->delayed_bio_q); mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx", "gvirstor", MTX_DEF | MTX_RECURSE); sc->geom = gp; sc->provider = NULL; /* virstor_check_and_run will create it */ gp->softc = sc; LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name); return (gp); } /* * Add provider to a GEOM class instance */ static int add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp, struct g_virstor_metadata *md) { struct g_virstor_component *component; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; if (md->no >= sc->n_components) return (EINVAL); /* "Current" compontent */ component = &(sc->components[md->no]); if (component->gcons != NULL) return (EEXIST); gp = sc->geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { /* TODO: this can be made to work */ LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid " "sector size (%d)", pp->name, sc->geom->name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" consumer * to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } } /* Bring up a new component */ cp->private = component; component->gcons = cp; component->sc = sc; component->index = md->no; component->chunk_count = md->chunk_count; component->chunk_next = md->chunk_next; component->chunk_reserved = md->chunk_reserved; component->flags = md->flags; LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name); virstor_check_and_run(sc); return (0); } /* * Check if everything's ready to create the geom provider & device entry, * create and start provider. 
* Called ultimately by .taste, from g_event thread */ static void virstor_check_and_run(struct g_virstor_softc *sc) { off_t off; size_t n, count; int index; int error; if (virstor_valid_components(sc) != sc->n_components) return; if (virstor_valid_components(sc) == 0) { /* This is actually a candidate for panic() */ LOG_MSG(LVL_ERROR, "No valid components for %s?", sc->provider->name); return; } sc->sectorsize = sc->components[0].gcons->provider->sectorsize; /* Initialise allocation map from the first consumer */ sc->chunk_count = sc->virsize / sc->chunk_size; if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) { LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes", sc->provider->name, sc->chunk_count * (off_t)sc->chunk_size); } sc->map_size = sc->chunk_count * sizeof *(sc->map); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", __func__, sc->map_size, sc->provider->name)); sc->map_sectors = sc->map_size / sc->sectorsize; count = 0; for (n = 0; n < sc->n_components; n++) count += sc->components[n].chunk_count; LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual " "(%zu KB chunks)", sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024); error = g_access(sc->components[0].gcons, 1, 0, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to " "read allocation map for %s", sc->components[0].gcons->provider->name, sc->geom->name); return; } /* Read in the allocation map */ LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name, sc->components[0].gcons->provider->name); off = count = n = 0; while (count < sc->map_size) { struct g_virstor_map_entry *mapbuf; size_t bs; bs = MIN(MAXPHYS, sc->map_size - count); if (bs % sc->sectorsize != 0) { /* Check for alignment errors */ bs = rounddown(bs, sc->sectorsize); if (bs == 0) break; LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned " "for %s on %s", sc->geom->name, sc->components[0].gcons->provider->name); } mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error); if (mapbuf == NULL) { free(sc->map, M_GVIRSTOR); LOG_MSG(LVL_ERROR, "Error reading allocation map " "for %s from %s (offset %ju) (error %d)", sc->geom->name, sc->components[0].gcons->provider->name, off, error); return; } bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; n += bs / sizeof *(sc->map); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name); /* find first component with allocatable chunks */ index = -1; for (n = 0; n < sc->n_components; n++) { if (sc->components[n].chunk_next < sc->components[n].chunk_count) { index = n; break; } } if (index == -1) /* not found? 
set it to the last component and handle it * later */ index = sc->n_components - 1; if (index >= sc->n_components - g_virstor_component_watermark - 1) { LOG_MSG(LVL_WARNING, "Device %s running out of components " "(%d/%u: %s)", sc->geom->name, index+1, sc->n_components, sc->components[index].gcons->provider->name); } sc->curr_component = index; if (sc->components[index].chunk_next >= sc->components[index].chunk_count - g_virstor_chunk_watermark) { LOG_MSG(LVL_WARNING, "Component %s of %s is running out of free space " "(%u chunks left)", sc->components[index].gcons->provider->name, sc->geom->name, sc->components[index].chunk_count - sc->components[index].chunk_next); } sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); if (sc->sectorsize % sizeof *(sc->map) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); return; } /* Recalculate allocated chunks in components & at the same time * verify map data is sane. We could trust metadata on this, but * we want to make sure. */ for (n = 0; n < sc->n_components; n++) sc->components[n].chunk_next = sc->components[n].chunk_reserved; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].provider_no >= sc->n_components || sc->map[n].provider_chunk >= sc->components[sc->map[n].provider_no].chunk_count) { LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s", __func__, (u_int)n, sc->geom->name); LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u" " provider_chunk: %u, chunk_count: %u", __func__, sc->map[n].provider_no, sc->n_components, sc->map[n].provider_chunk, sc->components[sc->map[n].provider_no].chunk_count); return; } if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) sc->components[sc->map[n].provider_no].chunk_next++; } sc->provider = g_new_providerf(sc->geom, "virstor/%s", sc->geom->name); sc->provider->sectorsize = sc->sectorsize; sc->provider->mediasize = sc->virsize; g_error_provider(sc->provider, 0); LOG_MSG(LVL_INFO, "%s activated", sc->provider->name); LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting " "chunk %u", sc->provider->name, sc->curr_component, sc->components[sc->curr_component].chunk_next); } /* * Returns count of active providers in this geom instance */ static u_int virstor_valid_components(struct g_virstor_softc *sc) { unsigned int nc, i; nc = 0; KASSERT(sc != NULL, ("%s: softc is NULL", __func__)); KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__)); for (i = 0; i < sc->n_components; i++) if (sc->components[i].gcons != NULL) nc++; return (nc); } /* * Called when the consumer gets orphaned (?) 
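virstor_check_and_run() derives all of its map geometry from three numbers: the virtual size, the chunk size and the first component's sector size. The allocation map is chunk_count entries of sizeof(struct virstor_map_entry) bytes stored at the front of the first component, and me_per_sector entries must fit a sector exactly. A standalone sketch of that arithmetic; the 8-byte entry mirrors the uint16/uint16/uint32 layout in g_virstor.h, while the 2 TB / 4 MB / 512 B figures are only an example configuration:

#include <stdint.h>
#include <stdio.h>

/* Mirrors struct virstor_map_entry from g_virstor.h (8 bytes). */
struct map_entry {
	uint16_t flags;
	uint16_t provider_no;
	uint32_t provider_chunk;
};

int
main(void)
{
	/* Example label parameters, not taken from a real device. */
	uint64_t virsize = 2ULL << 40;		/* 2 TB virtual size */
	uint64_t chunk_size = 4 << 20;		/* 4 MB chunks */
	uint64_t sectorsize = 512;

	uint64_t chunk_count = virsize / chunk_size;
	uint64_t map_size = chunk_count * sizeof(struct map_entry);
	uint64_t map_sectors = map_size / sectorsize;
	uint64_t me_per_sector = sectorsize / sizeof(struct map_entry);

	printf("chunks:          %ju\n", (uintmax_t)chunk_count);
	printf("map size:        %ju bytes (%ju sectors)\n",
	    (uintmax_t)map_size, (uintmax_t)map_sectors);
	printf("entries/sector:  %ju\n", (uintmax_t)me_per_sector);
	/* check_and_run() refuses to run unless this divides evenly. */
	printf("sector aligned:  %s\n",
	    sectorsize % sizeof(struct map_entry) == 0 ? "yes" : "no");
	return (0);
}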
*/ static void g_virstor_orphan(struct g_consumer *cp) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; comp = cp->private; KASSERT(comp != NULL, ("%s: No component in private part of consumer", __func__)); remove_component(sc, comp, FALSE); if (virstor_valid_components(sc) == 0) virstor_geom_destroy(sc, TRUE, FALSE); } /* * Called to notify geom when it's been opened, and for what intent */ static int g_virstor_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_virstor_softc *sc; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); sc = gp->softc; if (sc == NULL) { /* It seems that .access can be called with negative dr,dw,dx * in this case but I want to check for myself */ LOG_MSG(LVL_WARNING, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) LOG_MSG(LVL_DEBUG, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; update_metadata(sc); /* Writes statistical information */ } error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all eariler components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 
100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * Called in g_down thread */ static void g_virstor_start(struct bio *b) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct bio *cb; struct g_provider *pp; char *addr; off_t offset, length; struct bio_queue_head bq; size_t chunk_size; /* cached for convenience */ u_int count; pp = b->bio_to; sc = pp->geom->softc; KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__, b->bio_to->error, b->bio_to->name)); LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__); switch (b->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: g_io_deliver(b, EOPNOTSUPP); return; } LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length); bioq_init(&bq); chunk_size = sc->chunk_size; addr = b->bio_data; offset = b->bio_offset; /* virtual offset and length */ length = b->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; struct virstor_map_entry *me; chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)", b->bio_cmd == BIO_READ ? "R" : "W", offset, length, chunk_index, in_chunk_offset, in_chunk_length); me = &sc->map[chunk_index]; if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) { if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* Reads from unallocated chunks return zeroed * buffers */ if (b->bio_cmd == BIO_READ) bzero(addr, in_chunk_length); } else { comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } } else { /* handle BIO_WRITE */ KASSERT(b->bio_cmd == BIO_WRITE, ("%s: Unknown command %d", __func__, b->bio_cmd)); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* We have a virtual chunk, represented by * the "me" entry, but it's not yet allocated * (tied to) a physical chunk. So do it now. */ struct virstor_map_entry *data_me; u_int phys_chunk, comp_no; off_t s_offset; int error; error = allocate_chunk(sc, &comp, &comp_no, &phys_chunk); if (error != 0) { /* We cannot allocate a physical chunk * to satisfy this request, so we'll * delay it to when we can... 
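The heart of g_virstor_start() is the loop that walks a virtual (offset, length) range and slices it into per-chunk segments: chunk_index selects the map entry, in_chunk_offset is the remainder within that chunk, and in_chunk_length never crosses a chunk boundary. A userland sketch of just that slicing, independent of any GEOM structures:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

/*
 * Walk [offset, offset + length) in chunk_size units, printing the
 * same (chunk_index, in_chunk_offset, in_chunk_length) triples the
 * BIO-splitting loop computes before cloning each request.
 */
static void
split_request(uint64_t offset, uint64_t length, uint64_t chunk_size)
{
	while (length > 0) {
		uint64_t chunk_index = offset / chunk_size;
		uint64_t in_chunk_offset = offset % chunk_size;
		uint64_t in_chunk_length =
		    MIN(length, chunk_size - in_chunk_offset);

		printf("chunk %ju, offset %ju, length %ju\n",
		    (uintmax_t)chunk_index, (uintmax_t)in_chunk_offset,
		    (uintmax_t)in_chunk_length);
		offset += in_chunk_length;
		length -= in_chunk_length;
	}
}

int
main(void)
{
	/* A 10 KB request that straddles 4 KB chunk boundaries. */
	split_request(3072, 10240, 4096);
	return (0);
}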
* XXX: this will prevent the fs from * being umounted! */ struct g_virstor_bio_q *biq; biq = malloc(sizeof *biq, M_GVIRSTOR, M_NOWAIT); if (biq == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } biq->bio = b; mtx_lock(&sc->delayed_bio_q_mtx); STAILQ_INSERT_TAIL(&sc->delayed_bio_q, biq, linkage); mtx_unlock(&sc->delayed_bio_q_mtx); LOG_MSG(LVL_WARNING, "Delaying BIO " "(size=%ju) until free physical " "space can be found on %s", b->bio_length, sc->provider->name); return; } LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s " "for %s", phys_chunk, comp->gcons->provider->name, sc->provider->name); me->provider_no = comp_no; me->provider_chunk = phys_chunk; me->flags |= VIRSTOR_MAP_ALLOCATED; cb = g_clone_bio(b); if (cb == NULL) { me->flags &= ~VIRSTOR_MAP_ALLOCATED; me->provider_no = 0; me->provider_chunk = 0; bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* The allocation table is stored continuously * at the start of the drive. We need to * calculate the offset of the sector that holds * this map entry both on the drive and in the * map array. * sc_offset will end up pointing to the drive * sector. */ s_offset = chunk_index * sizeof *me; s_offset = rounddown(s_offset, sc->sectorsize); /* data_me points to map entry sector * in memory (analogous to offset) */ data_me = &sc->map[rounddown(chunk_index, sc->me_per_sector)]; /* Commit sector with map entry to storage */ cb->bio_to = sc->components[0].gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = s_offset; cb->bio_data = (char *)data_me; cb->bio_length = sc->sectorsize; cb->bio_caller1 = &sc->components[0]; bioq_disksort(&bq, cb); } comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* Finally, handle the data */ cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); LOG_REQ(LVL_MOREDEBUG, cb, "Firing request"); comp = cb->bio_caller1; cb->bio_caller1 = NULL; LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju", cb->bio_offset, cb->bio_length); g_io_request(cb, comp->gcons); count++; } if (count == 0) { /* We handled everything locally */ b->bio_completed = b->bio_length; g_io_deliver(b, 0); } } /* * Allocate a chunk from a physical provider. Returns physical component, * chunk index relative to the component and the component's index. */ static int allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp, u_int *comp_no_p, u_int *chunk) { u_int comp_no; KASSERT(sc->curr_component < sc->n_components, ("%s: Invalid curr_component: %u", __func__, sc->curr_component)); comp_no = sc->curr_component; *comp = &sc->components[comp_no]; dump_component(*comp); if ((*comp)->chunk_next >= (*comp)->chunk_count) { /* This component is full. 
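When a write allocates a new chunk, the corresponding map entry has to be persisted as well, so the code above locates both the on-disk sector that contains the entry (s_offset, chunk_index * sizeof(map entry) rounded down to a sector boundary) and the matching sector-aligned slice of the in-memory map (chunk_index rounded down to a multiple of me_per_sector). A small sketch of those two rounddown computations, reusing the 8-byte entry and 512-byte sector from the earlier sketch; the chunk index is arbitrary:

#include <stdint.h>
#include <stdio.h>

#define ROUNDDOWN(x, y)	(((x) / (y)) * (y))

int
main(void)
{
	uint64_t sectorsize = 512;
	uint64_t entry_size = 8;	/* sizeof(struct virstor_map_entry) */
	uint64_t me_per_sector = sectorsize / entry_size;	/* 64 */
	uint64_t chunk_index = 100000;	/* arbitrary example chunk */

	/* Byte offset of the map sector holding this entry, counted
	 * from the start of the first component where the map lives. */
	uint64_t s_offset = ROUNDDOWN(chunk_index * entry_size, sectorsize);
	/* Index of the first map entry in that same sector; the write
	 * uses &map[first_in_sector] as its data pointer. */
	uint64_t first_in_sector = ROUNDDOWN(chunk_index, me_per_sector);

	printf("map entry %ju lives in the sector at byte offset %ju,\n"
	    "which starts with map entry %ju\n",
	    (uintmax_t)chunk_index, (uintmax_t)s_offset,
	    (uintmax_t)first_in_sector);
	return (0);
}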
Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ MODULE_VERSION(geom_virstor, 0); Index: head/sys/geom/virstor/g_virstor.h =================================================================== --- head/sys/geom/virstor/g_virstor.h (revision 350693) +++ head/sys/geom/virstor/g_virstor.h (revision 350694) @@ -1,137 +1,119 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_VIRSTOR_H_ #define _G_VIRSTOR_H_ #define G_VIRSTOR_CLASS_NAME "VIRSTOR" #define VIRSTOR_MAP_ALLOCATED 1 struct virstor_map_entry { uint16_t flags; uint16_t provider_no; uint32_t provider_chunk; }; #define VIRSTOR_MAP_ENTRY_SIZE (sizeof(struct virstor_map_entry)) #define VIRSTOR_MAP_BLOCK_ENTRIES (MAXPHYS / VIRSTOR_MAP_ENTRY_SIZE) /* Struct size is guarded by CTASSERT in main source */ #ifdef _KERNEL -#define LOG_MSG(lvl, ...) do { \ - if (g_virstor_debug >= (lvl)) { \ - printf("GEOM_" G_VIRSTOR_CLASS_NAME); \ - if ((lvl) > 0) \ - printf("[%u]", (lvl)); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - } \ -} while (0) +#define LOG_MSG(lvl, ...) \ + _GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), NULL, __VA_ARGS__) #define LOG_MESSAGE LOG_MSG -#define LOG_REQ(lvl, bp, ...) do { \ - if (g_virstor_debug >= (lvl)) { \ - printf("GEOM_" G_VIRSTOR_CLASS_NAME); \ - if ((lvl) > 0) \ - printf("[%u]", (lvl)); \ - printf(": "); \ - printf(__VA_ARGS__); \ - printf(" "); \ - g_print_bio(bp); \ - printf("\n"); \ - } \ -} while (0) +#define LOG_REQ(lvl, bp, ...) \ + _GEOM_DEBUG("GEOM_VIRSTOR", g_virstor_debug, (lvl), (bp), __VA_ARGS__) #define LOG_REQUEST LOG_REQ /* "critical" system announcements (e.g. "geom is up") */ #define LVL_ANNOUNCE 0 /* errors */ #define LVL_ERROR 1 /* warnings */ #define LVL_WARNING 2 /* info, noncritical for system operation (user doesn't have to see it */ #define LVL_INFO 5 /* debug info */ #define LVL_DEBUG 10 /* more debug info */ #define LVL_DEBUG2 12 /* superfluous debug info (large volumes of data) */ #define LVL_MOREDEBUG 15 /* Component data */ struct g_virstor_component { struct g_consumer *gcons; struct g_virstor_softc *sc; unsigned int index; /* Component index in array */ unsigned int chunk_count; unsigned int chunk_next; unsigned int chunk_reserved; unsigned int flags; }; /* Internal geom instance data */ struct g_virstor_softc { struct g_geom *geom; struct g_provider *provider; struct g_virstor_component *components; u_int n_components; u_int curr_component; /* Component currently used */ uint32_t id; /* Unique ID of this geom */ off_t virsize; /* Total size of virstor */ off_t sectorsize; size_t chunk_size; size_t chunk_count; /* governs map_size */ struct virstor_map_entry *map; size_t map_size; /* (in bytes) */ size_t map_sectors; /* Size of map in sectors */ size_t me_per_sector; /* # map entries in a sector */ STAILQ_HEAD(, g_virstor_bio_q) delayed_bio_q; /* Queue of delayed BIOs */ struct mtx delayed_bio_q_mtx; }; /* "delayed BIOs" Queue element */ struct g_virstor_bio_q { struct bio *bio; STAILQ_ENTRY(g_virstor_bio_q) linkage; }; #endif /* _KERNEL */ #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif #endif /* !_G_VIRSTOR_H_ */
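The header ends with the delayed-BIO queue used by the write path: requests that cannot be given a physical chunk are parked on an STAILQ and either re-issued later or failed when the geom is destroyed. A minimal userland sketch of that queue pattern with the same <sys/queue.h> macros; the mutex that protects the real queue is omitted, and struct bio is replaced by a plain integer:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

/* Userland stand-in for the struct g_virstor_bio_q element above. */
struct delayed_req {
	int id;			/* stands in for the struct bio pointer */
	STAILQ_ENTRY(delayed_req) linkage;
};

STAILQ_HEAD(req_head, delayed_req);

int
main(void)
{
	struct req_head q = STAILQ_HEAD_INITIALIZER(q);
	struct delayed_req *r;

	/* Park two "requests" that could not be served right away,
	 * the way g_virstor_start() parks BIOs when no free physical
	 * chunk exists. */
	for (int i = 0; i < 2; i++) {
		r = malloc(sizeof(*r));
		if (r == NULL)
			abort();
		r->id = i;
		STAILQ_INSERT_TAIL(&q, r, linkage);
	}

	/* Drain the queue, as virstor_geom_destroy() does on teardown
	 * (there each BIO is failed; here we just report them). */
	while (!STAILQ_EMPTY(&q)) {
		r = STAILQ_FIRST(&q);
		STAILQ_REMOVE_HEAD(&q, linkage);
		printf("completing delayed request %d\n", r->id);
		free(r);
	}
	return (0);
}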