Index: head/sys/conf/files.x86 =================================================================== --- head/sys/conf/files.x86 (revision 361274) +++ head/sys/conf/files.x86 (revision 361275) @@ -1,345 +1,346 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # This file contains all the x86 devices and such that are # common between i386 and amd64, but aren't applicable to # any other architecture we support. # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "${KEYMAP} -L ${ATKBD_DFLT_KEYMAP} | ${KEYMAP_FIX} > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" cddl/dev/fbt/x86/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" cddl/dev/dtrace/x86/dis_tables.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" cddl/dev/dtrace/x86/instr_size.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" compat/ndis/kern_ndis.c optional ndisapi pci compat/ndis/kern_windrv.c optional ndisapi pci compat/ndis/subr_hal.c optional ndisapi pci compat/ndis/subr_ndis.c optional ndisapi pci compat/ndis/subr_ntoskrnl.c optional ndisapi pci compat/ndis/subr_pe.c optional ndisapi pci compat/ndis/subr_usbd.c optional ndisapi pci crypto/aesni/aesni.c optional aesni aesni_ghash.o optional aesni \ dependency "$S/crypto/aesni/aesni_ghash.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ghash.o" aesni_ccm.o optional aesni \ dependency "$S/crypto/aesni/aesni_ccm.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ccm.o" aesni_wrap.o optional aesni \ dependency "$S/crypto/aesni/aesni_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_wrap.o" intel_sha1.o optional aesni \ dependency "$S/crypto/aesni/intel_sha1.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ no-implicit-rule \ clean "intel_sha1.o" intel_sha256.o optional aesni \ dependency "$S/crypto/aesni/intel_sha256.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ no-implicit-rule \ clean "intel_sha256.o" crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/acpica/acpi_hpet.c optional acpi dev/acpica/acpi_if.m standard dev/acpica/acpi_pci.c optional acpi pci dev/acpica/acpi_pci_link.c optional acpi pci dev/acpica/acpi_pcib.c optional acpi pci dev/acpica/acpi_pcib_acpi.c optional acpi pci dev/acpica/acpi_pcib_pci.c optional acpi pci dev/acpica/acpi_pxm.c optional acpi dev/acpica/acpi_timer.c optional acpi dev/amdsbwd/amdsbwd.c optional amdsbwd dev/amdsmn/amdsmn.c optional amdsmn | amdtemp dev/amdtemp/amdtemp.c optional amdtemp dev/arcmsr/arcmsr.c optional arcmsr pci dev/asmc/asmc.c optional asmc isa dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/bxe/bxe.c optional bxe pci dev/bxe/bxe_stats.c optional bxe pci dev/bxe/bxe_debug.c optional bxe pci dev/bxe/ecore_sp.c optional bxe pci dev/bxe/bxe_elink.c optional bxe pci dev/bxe/57710_init_values.c optional bxe pci dev/bxe/57711_init_values.c optional bxe pci dev/bxe/57712_init_values.c optional bxe pci dev/coretemp/coretemp.c optional coretemp dev/cpuctl/cpuctl.c optional cpuctl dev/dpms/dpms.c optional dpms dev/fb/fb.c optional fb | vga dev/fb/s3_pci.c optional s3pci dev/fb/vesa.c optional vga vesa dev/fb/vga.c optional vga dev/fdc/fdc.c optional fdc dev/fdc/fdc_acpi.c optional fdc dev/fdc/fdc_isa.c optional fdc isa dev/fdc/fdc_pccard.c optional fdc pccard dev/gpio/bytgpio.c optional bytgpio dev/gpio/chvgpio.c optional chvgpio dev/hpt27xx/hpt27xx_os_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_osm_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_config.c optional hpt27xx hpt27xx_lib.o optional hpt27xx \ dependency "$S/dev/hpt27xx/$M-elf.hpt27xx_lib.o.uu" \ compile-with "uudecode < $S/dev/hpt27xx/$M-elf.hpt27xx_lib.o.uu" \ no-implicit-rule dev/hptmv/entry.c optional hptmv dev/hptmv/mv.c optional hptmv dev/hptmv/gui_lib.c optional hptmv dev/hptmv/hptproc.c optional hptmv dev/hptmv/ioctl.c optional hptmv hptmvraid.o optional hptmv \ dependency "$S/dev/hptmv/$M-elf.raid.o.uu" \ compile-with "uudecode < $S/dev/hptmv/$M-elf.raid.o.uu" \ no-implicit-rule dev/hptnr/hptnr_os_bsd.c optional hptnr dev/hptnr/hptnr_osm_bsd.c optional hptnr dev/hptnr/hptnr_config.c optional hptnr hptnr_lib.o optional hptnr \ dependency "$S/dev/hptnr/$M-elf.hptnr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptnr/$M-elf.hptnr_lib.o.uu" \ no-implicit-rule dev/hptrr/hptrr_os_bsd.c optional hptrr dev/hptrr/hptrr_osm_bsd.c optional hptrr dev/hptrr/hptrr_config.c optional hptrr hptrr_lib.o optional hptrr \ dependency "$S/dev/hptrr/$M-elf.hptrr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptrr/$M-elf.hptrr_lib.o.uu" \ no-implicit-rule dev/hwpmc/hwpmc_amd.c optional hwpmc dev/hwpmc/hwpmc_intel.c optional hwpmc dev/hwpmc/hwpmc_core.c optional hwpmc dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc +dev/hyperv/hvsock/hv_sock.c optional hyperv dev/hyperv/input/hv_kbd.c optional hyperv dev/hyperv/input/hv_kbdc.c optional hyperv dev/hyperv/pcib/vmbus_pcib.c optional hyperv pci dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hn_rndis.c optional hyperv dev/hyperv/netvsc/if_hn.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv dev/hyperv/utilities/hv_snapshot.c optional hyperv dev/hyperv/utilities/vmbus_heartbeat.c optional hyperv dev/hyperv/utilities/vmbus_ic.c optional hyperv dev/hyperv/utilities/vmbus_shutdown.c optional hyperv dev/hyperv/utilities/vmbus_timesync.c optional hyperv dev/hyperv/vmbus/hyperv.c optional hyperv dev/hyperv/vmbus/hyperv_busdma.c optional hyperv dev/hyperv/vmbus/vmbus.c optional hyperv pci dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv dev/hyperv/vmbus/vmbus_res.c optional hyperv dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/ichwd/ichwd.c optional ichwd dev/if_ndis/if_ndis.c optional ndis dev/if_ndis/if_ndis_pccard.c optional ndis pccard dev/if_ndis/if_ndis_pci.c optional ndis cardbus | ndis pci dev/if_ndis/if_ndis_usb.c optional ndis usb dev/imcsmb/imcsmb.c optional imcsmb dev/imcsmb/imcsmb_pci.c optional imcsmb pci dev/intel/spi.c optional intelspi dev/io/iodev.c optional io dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_acpi.c optional ipmi acpi dev/ipmi/ipmi_isa.c optional ipmi isa dev/ipmi/ipmi_kcs.c optional ipmi dev/ipmi/ipmi_smic.c optional ipmi dev/ipmi/ipmi_smbus.c optional ipmi smbus dev/ipmi/ipmi_smbios.c optional ipmi dev/ipmi/ipmi_ssif.c optional ipmi smbus dev/ipmi/ipmi_pci.c optional ipmi pci dev/ipmi/ipmi_linux.c optional ipmi compat_linux32 dev/isci/isci.c optional isci dev/isci/isci_controller.c optional isci dev/isci/isci_domain.c optional isci dev/isci/isci_interrupt.c optional isci dev/isci/isci_io_request.c optional isci dev/isci/isci_logger.c optional isci dev/isci/isci_oem_parameters.c optional isci dev/isci/isci_remote_device.c optional isci dev/isci/isci_sysctl.c optional isci dev/isci/isci_task_request.c optional isci dev/isci/isci_timer.c optional isci dev/isci/scil/sati.c optional isci dev/isci/scil/sati_abort_task_set.c optional isci dev/isci/scil/sati_atapi.c optional isci dev/isci/scil/sati_device.c optional isci dev/isci/scil/sati_inquiry.c optional isci dev/isci/scil/sati_log_sense.c optional isci dev/isci/scil/sati_lun_reset.c optional isci dev/isci/scil/sati_mode_pages.c optional isci dev/isci/scil/sati_mode_select.c optional isci dev/isci/scil/sati_mode_sense.c optional isci dev/isci/scil/sati_mode_sense_10.c optional isci dev/isci/scil/sati_mode_sense_6.c optional isci dev/isci/scil/sati_move.c optional isci dev/isci/scil/sati_passthrough.c optional isci dev/isci/scil/sati_read.c optional isci dev/isci/scil/sati_read_buffer.c optional isci dev/isci/scil/sati_read_capacity.c optional isci dev/isci/scil/sati_reassign_blocks.c optional isci dev/isci/scil/sati_report_luns.c optional isci dev/isci/scil/sati_request_sense.c optional isci dev/isci/scil/sati_start_stop_unit.c optional isci dev/isci/scil/sati_synchronize_cache.c optional isci dev/isci/scil/sati_test_unit_ready.c optional isci dev/isci/scil/sati_unmap.c optional isci dev/isci/scil/sati_util.c optional isci dev/isci/scil/sati_verify.c optional isci dev/isci/scil/sati_write.c optional isci dev/isci/scil/sati_write_and_verify.c optional isci dev/isci/scil/sati_write_buffer.c optional isci dev/isci/scil/sati_write_long.c optional isci dev/isci/scil/sci_abstract_list.c optional isci dev/isci/scil/sci_base_controller.c optional isci dev/isci/scil/sci_base_domain.c optional isci dev/isci/scil/sci_base_iterator.c optional isci dev/isci/scil/sci_base_library.c optional isci dev/isci/scil/sci_base_logger.c optional isci dev/isci/scil/sci_base_memory_descriptor_list.c optional isci dev/isci/scil/sci_base_memory_descriptor_list_decorator.c optional isci dev/isci/scil/sci_base_object.c optional isci dev/isci/scil/sci_base_observer.c optional isci dev/isci/scil/sci_base_phy.c optional isci dev/isci/scil/sci_base_port.c optional isci dev/isci/scil/sci_base_remote_device.c optional isci dev/isci/scil/sci_base_request.c optional isci dev/isci/scil/sci_base_state_machine.c optional isci dev/isci/scil/sci_base_state_machine_logger.c optional isci dev/isci/scil/sci_base_state_machine_observer.c optional isci dev/isci/scil/sci_base_subject.c optional isci dev/isci/scil/sci_util.c optional isci dev/isci/scil/scic_sds_controller.c optional isci dev/isci/scil/scic_sds_library.c optional isci dev/isci/scil/scic_sds_pci.c optional isci dev/isci/scil/scic_sds_phy.c optional isci dev/isci/scil/scic_sds_port.c optional isci dev/isci/scil/scic_sds_port_configuration_agent.c optional isci dev/isci/scil/scic_sds_remote_device.c optional isci dev/isci/scil/scic_sds_remote_node_context.c optional isci dev/isci/scil/scic_sds_remote_node_table.c optional isci dev/isci/scil/scic_sds_request.c optional isci dev/isci/scil/scic_sds_sgpio.c optional isci dev/isci/scil/scic_sds_smp_remote_device.c optional isci dev/isci/scil/scic_sds_smp_request.c optional isci dev/isci/scil/scic_sds_ssp_request.c optional isci dev/isci/scil/scic_sds_stp_packet_request.c optional isci dev/isci/scil/scic_sds_stp_remote_device.c optional isci dev/isci/scil/scic_sds_stp_request.c optional isci dev/isci/scil/scic_sds_unsolicited_frame_control.c optional isci dev/isci/scil/scif_sas_controller.c optional isci dev/isci/scil/scif_sas_controller_state_handlers.c optional isci dev/isci/scil/scif_sas_controller_states.c optional isci dev/isci/scil/scif_sas_domain.c optional isci dev/isci/scil/scif_sas_domain_state_handlers.c optional isci dev/isci/scil/scif_sas_domain_states.c optional isci dev/isci/scil/scif_sas_high_priority_request_queue.c optional isci dev/isci/scil/scif_sas_internal_io_request.c optional isci dev/isci/scil/scif_sas_io_request.c optional isci dev/isci/scil/scif_sas_io_request_state_handlers.c optional isci dev/isci/scil/scif_sas_io_request_states.c optional isci dev/isci/scil/scif_sas_library.c optional isci dev/isci/scil/scif_sas_remote_device.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substates.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substates.c optional isci dev/isci/scil/scif_sas_remote_device_state_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_states.c optional isci dev/isci/scil/scif_sas_request.c optional isci dev/isci/scil/scif_sas_smp_activity_clear_affiliation.c optional isci dev/isci/scil/scif_sas_smp_io_request.c optional isci dev/isci/scil/scif_sas_smp_phy.c optional isci dev/isci/scil/scif_sas_smp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_io_request.c optional isci dev/isci/scil/scif_sas_stp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_task_request.c optional isci dev/isci/scil/scif_sas_task_request.c optional isci dev/isci/scil/scif_sas_task_request_state_handlers.c optional isci dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci dev/itwd/itwd.c optional itwd libkern/x86/crc32_sse42.c standard # # x86 shared code between IA32 and AMD64 architectures # x86/acpica/OsdEnvironment.c optional acpi x86/acpica/acpi_apm.c optional acpi x86/acpica/acpi_wakeup.c optional acpi x86/acpica/srat.c optional acpi x86/bios/smbios.c optional smbios x86/bios/vpd.c optional vpd x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate_amd.c optional cpufreq x86/cpufreq/hwpstate_intel.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/cpufreq/powernow.c optional cpufreq x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci x86/iommu/intel_ctx.c optional acpi acpi_dmar pci x86/iommu/intel_drv.c optional acpi acpi_dmar pci x86/iommu/intel_fault.c optional acpi acpi_dmar pci x86/iommu/intel_gas.c optional acpi acpi_dmar pci x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci x86/iommu/intel_intrmap.c optional acpi acpi_dmar pci x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atrtc.c standard x86/isa/clock.c standard x86/isa/isa.c optional isa x86/isa/isa_dma.c optional isa x86/isa/nmi.c standard x86/isa/orm.c optional isa x86/pci/pci_bus.c optional pci x86/pci/qpi.c optional pci x86/x86/autoconf.c standard x86/x86/bus_machdep.c standard x86/x86/busdma_bounce.c standard x86/x86/busdma_machdep.c standard x86/x86/cpu_machdep.c standard x86/x86/dump_machdep.c standard x86/x86/fdt_machdep.c optional fdt x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/legacy.c standard x86/x86/mca.c standard x86/x86/x86_mem.c optional mem x86/x86/mp_x86.c optional smp x86/x86/mp_watchdog.c optional mp_watchdog smp x86/x86/nexus.c standard x86/x86/pvclock.c standard x86/x86/stack_machdep.c optional ddb | stack x86/x86/tsc.c standard x86/x86/ucode.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm x86/xen/xen_intr.c optional xenhvm x86/xen/xen_apic.c optional xenhvm x86/xen/xenpv.c optional xenhvm x86/xen/xen_msi.c optional xenhvm x86/xen/xen_nexus.c optional xenhvm Index: head/sys/dev/hyperv/hvsock/hv_sock.c =================================================================== --- head/sys/dev/hyperv/hvsock/hv_sock.c (nonexistent) +++ head/sys/dev/hyperv/hvsock/hv_sock.c (revision 361275) @@ -0,0 +1,1748 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "hv_sock.h" + +#define HVSOCK_DBG_NONE 0x0 +#define HVSOCK_DBG_INFO 0x1 +#define HVSOCK_DBG_ERR 0x2 +#define HVSOCK_DBG_VERBOSE 0x3 + + +SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); + +static int hvs_dbg_level; +SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, + 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); + + +#define HVSOCK_DBG(level, ...) do { \ + if (hvs_dbg_level >= (level)) \ + printf(__VA_ARGS__); \ + } while (0) + +MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); + +/* The MTU is 16KB per host side's design */ +#define HVSOCK_MTU_SIZE (1024 * 16) +#define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) + +#define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) + +#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ + roundup2(payload_len, 8) + \ + sizeof(uint64_t)) + + +static struct domain hv_socket_domain; + +/* + * HyperV Transport sockets + */ +static struct pr_usrreqs hvs_trans_usrreqs = { + .pru_attach = hvs_trans_attach, + .pru_bind = hvs_trans_bind, + .pru_listen = hvs_trans_listen, + .pru_accept = hvs_trans_accept, + .pru_connect = hvs_trans_connect, + .pru_peeraddr = hvs_trans_peeraddr, + .pru_sockaddr = hvs_trans_sockaddr, + .pru_soreceive = hvs_trans_soreceive, + .pru_sosend = hvs_trans_sosend, + .pru_disconnect = hvs_trans_disconnect, + .pru_close = hvs_trans_close, + .pru_detach = hvs_trans_detach, + .pru_shutdown = hvs_trans_shutdown, + .pru_abort = hvs_trans_abort, +}; + +/* + * Definitions of protocols supported in HyperV socket domain + */ +static struct protosw hv_socket_protosw[] = { +{ + .pr_type = SOCK_STREAM, + .pr_domain = &hv_socket_domain, + .pr_protocol = HYPERV_SOCK_PROTO_TRANS, + .pr_flags = PR_CONNREQUIRED, + .pr_init = hvs_trans_init, + .pr_usrreqs = &hvs_trans_usrreqs, +}, +}; + +static struct domain hv_socket_domain = { + .dom_family = AF_HYPERV, + .dom_name = "hyperv", + .dom_protosw = hv_socket_protosw, + .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] +}; + +VNET_DOMAIN_SET(hv_socket_); + +#define MAX_PORT ((uint32_t)0xFFFFFFFF) +#define MIN_PORT ((uint32_t)0x0) + +/* 00000000-facb-11e6-bd58-64006a7986d3 */ +static const struct hyperv_guid srv_id_template = { + .hv_guid = { + 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, + 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } +}; + +static int hvsock_br_callback(void *, int, void *); +static uint32_t hvsock_canread_check(struct hvs_pcb *); +static uint32_t hvsock_canwrite_check(struct hvs_pcb *); +static int hvsock_send_data(struct vmbus_channel *chan, + struct uio *uio, uint32_t to_write, struct sockbuf *sb); + + + +/* Globals */ +static struct sx hvs_trans_socks_sx; +static struct mtx hvs_trans_socks_mtx; +static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; +static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; +static uint32_t previous_auto_bound_port; + +static void +hvsock_print_guid(struct hyperv_guid *guid) +{ + unsigned char *p = (unsigned char *)guid; + + HVSOCK_DBG(HVSOCK_DBG_INFO, + "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", + *(unsigned int *)p, + *((unsigned short *) &p[4]), + *((unsigned short *) &p[6]), + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); +} + +static bool +is_valid_srv_id(const struct hyperv_guid *id) +{ + return !memcmp(&id->hv_guid[4], + &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); +} + +static unsigned int +get_port_by_srv_id(const struct hyperv_guid *srv_id) +{ + return *((const unsigned int *)srv_id); +} + +static void +set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) +{ + *((unsigned int *)srv_id) = port; +} + + +static void +__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + + if (!pcb) + return; + + if (list & HVS_LIST_BOUND) { + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p == pcb) + LIST_REMOVE(p, bound_next); + } + + if (list & HVS_LIST_CONNECTED) { + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p == pcb) + LIST_REMOVE(pcb, connected_next); + } +} + +static void +__hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + + __hvs_remove_pcb_from_list(pcb, list); +} + +static void +__hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + if (list & HVS_LIST_BOUND) + LIST_INSERT_HEAD(&hvs_trans_bound_socks, + pcb, bound_next); + + if (list & HVS_LIST_CONNECTED) + LIST_INSERT_HEAD(&hvs_trans_connected_socks, + pcb, connected_next); +} + +void +hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static void +hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_insert_socket_on_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static struct socket * +__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + if (list & HVS_LIST_BOUND) + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + if (list & HVS_LIST_CONNECTED) + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + return NULL; +} + +static struct socket * +hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct socket *s = NULL; + + mtx_lock(&hvs_trans_socks_mtx); + s = __hvs_find_socket_on_list(addr, list); + mtx_unlock(&hvs_trans_socks_mtx); + + return s; +} + +static inline void +hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) +{ + memset(addr, 0, sizeof(*addr)); + addr->sa_family = AF_HYPERV; + addr->hvs_port = port; +} + +void +hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) +{ + hvs_addr_set(addr, get_port_by_srv_id(svr_id)); +} + +int +hvs_trans_lock(void) +{ + sx_xlock(&hvs_trans_socks_sx); + return (0); +} + +void +hvs_trans_unlock(void) +{ + sx_xunlock(&hvs_trans_socks_sx); +} + +void +hvs_trans_init(void) +{ + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + if (vm_guest != VM_GUEST_HV) + return; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_init called\n", __func__); + + /* Initialize Globals */ + previous_auto_bound_port = MAX_PORT; + sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); + mtx_init(&hvs_trans_socks_mtx, + "hvs_trans_socks_mtx", NULL, MTX_DEF); + LIST_INIT(&hvs_trans_bound_socks); + LIST_INIT(&hvs_trans_connected_socks); +} + +/* + * Called in two cases: + * 1) When user calls socket(); + * 2) When we accept new incoming conneciton and call sonewconn(). + */ +int +hvs_trans_attach(struct socket *so, int proto, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_attach called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (ESOCKTNOSUPPORT); + + if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) + return (EPROTONOSUPPORT); + + if (pcb != NULL) + return (EISCONN); + pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); + if (pcb == NULL) + return (ENOMEM); + + pcb->so = so; + so->so_pcb = (void *)pcb; + + return (0); +} + +void +hvs_trans_detach(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_detach called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + bzero(pcb, sizeof(*pcb)); + free(pcb, M_HVSOCK); + } + + so->so_pcb = NULL; + + hvs_trans_unlock(); +} + +int +hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_bind called\n", __func__); + + if (sa == NULL) { + return (EINVAL); + } + + if (pcb == NULL) { + return (EINVAL); + } + + if (sa->sa_family != AF_HYPERV) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Not supported, sa_family is %u\n", + __func__, sa->sa_family); + return (EAFNOSUPPORT); + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: binding port = 0x%x\n", __func__, sa->hvs_port); + + mtx_lock(&hvs_trans_socks_mtx); + if (__hvs_find_socket_on_list(sa, + HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { + error = EADDRINUSE; + } else { + /* + * The address is available for us to bind. + * Add socket to the bound list. + */ + hvs_addr_set(&pcb->local_addr, sa->hvs_port); + hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + } + mtx_unlock(&hvs_trans_socks_mtx); + + return (error); +} + +int +hvs_trans_listen(struct socket *so, int backlog, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct socket *bound_so; + int error; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_listen called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + /* Check if the address is already bound and it was by us. */ + bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); + if (bound_so == NULL || bound_so != so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Address not bound or not by us.\n", __func__); + return (EADDRNOTAVAIL); + } + + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0) + solisten_proto(so, backlog); + SOCK_UNLOCK(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket listen error = %d\n", __func__, error); + return (error); +} + +int +hvs_trans_accept(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_accept called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, + M_NOWAIT); + + return ((*nam == NULL) ? ENOMEM : 0); +} + +int +hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; + bool found_auto_bound_port = false; + int i, error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", + __func__, raddr->hvs_port); + + if (pcb == NULL) + return (EINVAL); + + /* Verify the remote address */ + if (raddr == NULL) + return (EINVAL); + if (raddr->sa_family != AF_HYPERV) + return (EAFNOSUPPORT); + + mtx_lock(&hvs_trans_socks_mtx); + if (so->so_state & + (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket connect in progress\n", + __func__); + error = EINPROGRESS; + goto out; + } + + /* + * Find an available port for us to auto bind the local + * address. + */ + hvs_addr_set(&pcb->local_addr, 0); + + for (i = previous_auto_bound_port - 1; + i != previous_auto_bound_port; i --) { + if (i == MIN_PORT) + i = MAX_PORT; + + pcb->local_addr.hvs_port = i; + + if (__hvs_find_socket_on_list(&pcb->local_addr, + HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { + found_auto_bound_port = true; + previous_auto_bound_port = i; + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: found local bound port is %x\n", + __func__, pcb->local_addr.hvs_port); + break; + } + } + + if (found_auto_bound_port == true) { + /* Found available port for auto bound, put on list */ + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + /* Set VM service ID */ + pcb->vm_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); + /* Set host service ID and remote port */ + pcb->host_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); + hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); + + /* Change the socket state to SS_ISCONNECTING */ + soisconnecting(so); + } else { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: No local port available for auto bound\n", + __func__); + error = EADDRINUSE; + } + + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); + hvsock_print_guid(&pcb->vm_srv_id); + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); + hvsock_print_guid(&pcb->host_srv_id); + +out: + mtx_unlock(&hvs_trans_socks_mtx); + + if (found_auto_bound_port == true) + vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); + + return (error); +} + +int +hvs_trans_disconnect(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return (EINVAL); + } + + /* If socket is already disconnected, skip this */ + if ((so->so_state & SS_ISDISCONNECTED) == 0) + soisdisconnecting(so); + + hvs_trans_unlock(); + + return (0); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) +struct hvs_callback_arg { + struct uio *uio; + struct sockbuf *sb; +}; + +int +hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canread, to_read; + int flags, error = 0; + struct hvs_callback_arg cbarg; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + if (flagsp != NULL) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + + if (flags & MSG_PEEK) + return (EOPNOTSUPP); + + /* If no space to copy out anything */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) + return (EINVAL); + + sb = &so->so_rcv; + + orig_resid = uio->uio_resid; + + /* Prevent other readers from entering the socket. */ + error = sblock(sb, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: sblock returned error = %d\n", __func__, error); + return (error); + } + + SOCKBUF_LOCK(sb); + + cbarg.uio = uio; + cbarg.sb = sb; + /* + * If the socket is closing, there might still be some data + * in rx br to read. However we need to make sure + * the channel is still open. + */ + if ((sb->sb_state & SBS_CANTRCVMORE) && + (so->so_state & SS_ISDISCONNECTED)) { + /* Other thread already closed the channel */ + error = EPIPE; + goto out; + } + + while (true) { + while (uio->uio_resid > 0 && + (canread = hvsock_canread_check(pcb)) > 0) { + to_read = MIN(canread, uio->uio_resid); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: to_read = %u, skip = %u\n", __func__, to_read, + (unsigned int)(sizeof(struct hvs_pkt_header) + + pcb->recv_data_off)); + + error = vmbus_chan_recv_peek_call(pcb->chan, to_read, + sizeof(struct hvs_pkt_header) + pcb->recv_data_off, + hvsock_br_callback, (void *)&cbarg); + /* + * It is possible socket is disconnected becasue + * we released lock in hvsock_br_callback. So we + * need to check the state to make sure it is not + * disconnected. + */ + if (error || so->so_state & SS_ISDISCONNECTED) { + break; + } + + pcb->recv_data_len -= to_read; + pcb->recv_data_off += to_read; + } + + if (error) + break; + + /* Abort if socket has reported problems. */ + if (so->so_error) { + if (so->so_error == ESHUTDOWN && + orig_resid > uio->uio_resid) { + /* + * Although we got a FIN, we also received + * some data in this round. Delivery it + * to user. + */ + error = 0; + } else { + if (so->so_error != ESHUTDOWN) + error = so->so_error; + } + + break; + } + + /* Cannot received more. */ + if (sb->sb_state & SBS_CANTRCVMORE) + break; + + /* We are done if buffer has been filled */ + if (uio->uio_resid == 0) + break; + + if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) + break; + + /* Buffer ring is empty and we shall not block */ + if ((so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO))) { + if (orig_resid == uio->uio_resid) { + /* We have not read anything */ + error = EAGAIN; + } + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: non blocked read return, error %d.\n", + __func__, error); + break; + } + + /* + * Wait and block until (more) data comes in. + * Note: Drops the sockbuf lock during wait. + */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, read available is %u\n", + __func__, vmbus_chan_read_available(pcb->chan)); + } + +out: + SOCKBUF_UNLOCK(sb); + + sbunlock(sb); + + /* We recieved a FIN in this call */ + if (so->so_error == ESHUTDOWN) { + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + /* Send has already closed */ + soisdisconnecting(so); + } else { + /* Just close the receive side */ + socantrcvmore(so); + } + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: returning error = %d, so_error = %d\n", + __func__, error, so->so_error); + + return (error); +} + +int +hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canwrite, to_write; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %lu\n", + __func__, uio->uio_resid); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + /* If nothing to send */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) + return (EINVAL); + + sb = &so->so_snd; + + orig_resid = uio->uio_resid; + + /* Prevent other writers from entering the socket. */ + error = sblock(sb, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: sblock returned error = %d\n", __func__, error); + return (error); + } + + SOCKBUF_LOCK(sb); + + if ((sb->sb_state & SBS_CANTSENDMORE) || + so->so_error == ESHUTDOWN) { + error = EPIPE; + goto out; + } + + while (uio->uio_resid > 0) { + canwrite = hvsock_canwrite_check(pcb); + if (canwrite == 0) { + /* We have sent some data */ + if (orig_resid > uio->uio_resid) + break; + /* + * We have not sent any data and it is + * non-blocked io + */ + if (so->so_state & SS_NBIO || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { + error = EWOULDBLOCK; + break; + } else { + /* + * We are here because there is no space on + * send buffer ring. Signal the other side + * to read and free more space. + * Sleep wait until space avaiable to send + * Note: Drops the sockbuf lock during wait. + */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, space avail on " + "tx ring is %u\n", + __func__, + vmbus_chan_write_available(pcb->chan)); + + continue; + } + } + to_write = MIN(canwrite, uio->uio_resid); + to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite is %u, to_write = %u\n", __func__, + canwrite, to_write); + error = hvsock_send_data(pcb->chan, uio, to_write, sb); + + if (error) + break; + } + +out: + SOCKBUF_UNLOCK(sb); + sbunlock(sb); + + return (error); +} + +int +hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); + + return ((*nam == NULL)? ENOMEM : 0); +} + +int +hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); + + return ((*nam == NULL)? ENOMEM : 0); +} + +void +hvs_trans_close(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_close called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (!pcb) { + hvs_trans_unlock(); + return; + } + + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hvs_trans_close sending a FIN to host\n", __func__); + (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); + } + + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + + pcb->chan = NULL; + pcb->so = NULL; + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + hvs_trans_unlock(); + + return; +} + +void +hvs_trans_abort(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_abort called\n", __func__); + + (void) hvs_trans_lock(); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + if (so->so_state & SS_ISCONNECTED) { + (void) sodisconnect(so); + } + hvs_trans_unlock(); + + return; +} + +int +hvs_trans_shutdown(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + /* + * Only get called with the shutdown method is SHUT_WR or + * SHUT_RDWR. + * When the method is SHUT_RD or SHUT_RDWR, the caller + * already set the SBS_CANTRCVMORE on receive side socket + * buffer. + */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { + /* + * SHUT_WR only case. + * Receive side is still open. Just close + * the send side. + */ + socantsendmore(so); + } else { + /* SHUT_RDWR case */ + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + (void) hvsock_send_data(pcb->chan, NULL, 0, sb); + SOCKBUF_UNLOCK(sb); + + soisdisconnecting(so); + } + } + + return (0); +} + +/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is + * (see struct sockaddr_hvs). + * + * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- + * guide/make-integration-service, and the endpoint is with + * the below sockaddr: + * + * struct SOCKADDR_HV + * { + * ADDRESS_FAMILY Family; + * USHORT Reserved; + * GUID VmId; + * GUID ServiceId; + * }; + * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via + * VMBus, because here it's obvious the host and the VM can easily identify + * each other. Though the VmID is useful on the host, especially in the case + * of Windows container, FreeBSD VM doesn't need it at all. + * + * To be compatible with similar infrastructure in Linux VMs, we have + * to limit the available GUID space of SOCKADDR_HV so that we can create + * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. + * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is: + * + **************************************************************************** + * The only valid Service GUIDs, from the perspectives of both the host and * + * FreeBSD VM, that can be connected by the other end, must conform to this * + * format: -facb-11e6-bd58-64006a7986d3. * + **************************************************************************** + * + * When we write apps on the host to connect(), the GUID ServiceID is used. + * When we write apps in FreeBSD VM to connect(), we only need to specify the + * port and the driver will form the GUID and use that to request the host. + * + * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the + * auto-generated remote port for a connect request initiated by the host's + * connect()) is set to HVADDR_PORT_UNKNOWN, which is not realy used on the + * FreeBSD guest. + */ + +/* + * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before) + * restricts HyperV socket ring buffer size to six 4K pages. Newer + * HyperV hosts doen't have this limit. + */ +#define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) +#define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) +#define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) + +struct hvsock_sc { + device_t dev; + struct hvs_pcb *pcb; + struct vmbus_channel *channel; +}; + +static bool +hvsock_chan_readable(struct vmbus_channel *chan) +{ + uint32_t readable = vmbus_chan_read_available(chan); + + return (readable >= HVSOCK_PKT_LEN(0)); +} + +static void +hvsock_chan_cb(struct vmbus_channel *chan, void *context) +{ + struct hvs_pcb *pcb = (struct hvs_pcb *) context; + struct socket *so; + uint32_t canwrite; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: host send us a wakeup on rb data, pcb = %p\n", + __func__, pcb); + + /* + * Check if the socket is still attached and valid. + * Here we know channel is still open. Need to make + * sure the socket has not been closed or freed. + */ + (void) hvs_trans_lock(); + so = hsvpcb2so(pcb); + + if (pcb->chan != NULL && so != NULL) { + /* + * Wake up reader if there are data to read. + */ + SOCKBUF_LOCK(&(so)->so_rcv); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: read available = %u\n", __func__, + vmbus_chan_read_available(pcb->chan)); + + if (hvsock_chan_readable(pcb->chan)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&(so)->so_rcv); + + /* + * Wake up sender if space becomes available to write. + */ + SOCKBUF_LOCK(&(so)->so_snd); + canwrite = hvsock_canwrite_check(pcb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite = %u\n", __func__, canwrite); + + if (canwrite > 0) { + sowwakeup_locked(so); + } else { + SOCKBUF_UNLOCK(&(so)->so_snd); + } + } + + hvs_trans_unlock(); + + return; +} + +static int +hvsock_br_callback(void *datap, int cplen, void *cbarg) +{ + struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; + struct uio *uio = arg->uio; + struct sockbuf *sb = arg->sb; + int error = 0; + + if (cbarg == NULL || datap == NULL) + return (EINVAL); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: called, uio_rw = %s, uio_resid = %lu, cplen = %u, " + "datap = %p\n", + __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", + uio->uio_resid, cplen, datap); + + if (sb) + SOCKBUF_UNLOCK(sb); + + error = uiomove(datap, cplen, uio); + + if (sb) + SOCKBUF_LOCK(sb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after uiomove, uio_resid = %lu, error = %d\n", + __func__, uio->uio_resid, error); + + return (error); +} + +static int +hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, + uint32_t to_write, struct sockbuf *sb) +{ + struct hvs_pkt_header hvs_pkt; + int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; + uint64_t pad = 0; + struct iovec iov[3]; + struct hvs_callback_arg cbarg; + + if (chan == NULL) + return (ENOTCONN); + + hlen = sizeof(struct vmbus_chanpkt_hdr); + hvs_pkthlen = sizeof(struct hvs_pkt_header); + hvs_pktlen = hvs_pkthlen + to_write; + pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " + "pad_pktlen = %u, data_len = %u\n", + __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); + + hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; + hvs_pkt.chan_pkt_hdr.cph_flags = 0; + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); + hvs_pkt.chan_pkt_hdr.cph_xactid = 0; + + hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; + hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; + + cbarg.uio = uio; + cbarg.sb = sb; + + if (uio && to_write > 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = NULL; + iov[1].iov_len = to_write; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - hvs_pktlen; + + error = vmbus_chan_iov_send(chan, iov, 3, + hvsock_br_callback, &cbarg); + } else { + if (to_write == 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = &pad; + iov[1].iov_len = pad_pktlen - hvs_pktlen; + error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); + } + } + + if (error) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: error = %d\n", __func__, error); + } + + return (error); +} + +/* + * Check if we have data on current ring buffer to read + * or not. If not, advance the ring buffer read index to + * next packet. Update the recev_data_len and recev_data_off + * to new value. + * Return the number of bytes can read. + */ +static uint32_t +hvsock_canread_check(struct hvs_pcb *pcb) +{ + uint32_t advance; + uint32_t tlen, hlen, dlen; + uint32_t bytes_canread = 0; + int error; + + if (pcb == NULL || pcb->chan == NULL) { + pcb->so->so_error = EIO; + return (0); + } + + /* Still have data not read yet on current packet */ + if (pcb->recv_data_len > 0) + return (pcb->recv_data_len); + + if (pcb->rb_init) + advance = + VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + else + advance = 0; + + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: bytes_canread on br = %u, advance = %u\n", + __func__, bytes_canread, advance); + + if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { + /* + * Nothing to read. Need to advance the rindex before + * calling sbwait, so host knows to wake us up when data + * is available to read on rb. + */ + error = vmbus_chan_recv_idxadv(pcb->chan, advance); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: after calling vmbus_chan_recv_idxadv, " + "got error = %d\n", __func__, error); + return (0); + } else { + pcb->rb_init = false; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: advanced %u bytes, " + " bytes_canread on br now = %u\n", + __func__, advance, bytes_canread); + + if (bytes_canread == 0) + return (0); + else + advance = 0; + } + } + + if (bytes_canread < + advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) + return (0); + + error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, + sizeof(struct hvs_pkt_header), advance); + + /* Don't have anything to read */ + if (error) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after calling vmbus_chan_recv_peek, got error = %d\n", + __func__, error); + return (0); + } + + /* + * We just read in a new packet header. Do some sanity checks. + */ + tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); + dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; + if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || + __predict_false(hlen > tlen) || + __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "invalid tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + pcb->so->so_error = EIO; + return (0); + } + if (pcb->rb_init == false) + pcb->rb_init = true; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + + /* The other side has sent a close FIN */ + if (dlen == 0) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: Received FIN from other side\n", __func__); + /* inform the caller by seting so_error to ESHUTDOWN */ + pcb->so->so_error = ESHUTDOWN; + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canread on receive ring is %u \n", __func__, dlen); + + pcb->recv_data_len = dlen; + pcb->recv_data_off = 0; + + return (pcb->recv_data_len); +} + +static uint32_t +hvsock_canwrite_check(struct hvs_pcb *pcb) +{ + uint32_t writeable; + uint32_t ret; + + if (pcb == NULL || pcb->chan == NULL) + return (0); + + writeable = vmbus_chan_write_available(pcb->chan); + + /* + * We must always reserve a 0-length-payload packet for the FIN. + */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: writeable is %u, should be greater than %lu\n", + __func__, writeable, HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)); + + if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { + /* + * The Tx ring seems full. + */ + return (0); + } + + ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: available size is %u\n", __func__, rounddown2(ret, 8)); + + return (rounddown2(ret, 8)); +} + +static void +hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) +{ + vmbus_chan_set_pending_send_size(chan, + HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); +} + +static int +hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) +{ + unsigned int rcvbuf, sndbuf; + struct hvs_pcb *pcb = so2hvspcb(so); + int ret; + + if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { + sndbuf = HVS_RINGBUF_SND_SIZE; + rcvbuf = HVS_RINGBUF_RCV_SIZE; + } else { + sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); + sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); + sndbuf = rounddown2(sndbuf, PAGE_SIZE); + rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); + rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); + rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); + } + + /* + * Can only read whatever user provided size of data + * from ring buffer. Turn off batched reading. + */ + vmbus_chan_set_readbatch(chan, false); + + ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, + hvsock_chan_cb, pcb); + + if (ret != 0) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: failed to open hvsock channel, sndbuf = %u, " + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + } else { + HVSOCK_DBG(HVSOCK_DBG_INFO, + "%s: hvsock channel opened, sndbuf = %u, i" + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + /* + * Se the pending send size so to receive wakeup + * signals from host when there is enough space on + * rx buffer ring to write. + */ + hvsock_set_chan_pending_send_size(chan); + } + + return ret; +} + +/* + * Guest is listening passively on the socket. Open channel and + * create a new socket for the conneciton. + */ +static void +hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, + struct hvsock_sc *sc) +{ + struct socket *new_so; + struct hvs_pcb *new_pcb, *pcb; + int error; + + /* Do nothing if socket is not listening */ + if ((so->so_options & SO_ACCEPTCONN) == 0) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket is not a listening one\n", __func__); + return; + } + + /* + * Create a new socket. This will call pru_attach to complete + * the socket initialization and put the new socket onto + * listening socket's sol_incomp list, waiting to be promoted + * to sol_comp list. + * The new socket created has ref count 0. There is no other + * thread that changes the state of this new one at the + * moment, so we don't need to hold its lock while opening + * channel and filling out its pcb information. + */ + new_so = sonewconn(so, 0); + if (!new_so) + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: creating new socket failed\n", __func__); + + /* + * Now open the vmbus channel. If it fails, the socket will be + * on the listening socket's sol_incomp queue until it is + * replaced and aborted. + */ + error = hvsock_open_channel(chan, new_so); + if (error) { + new_so->so_error = error; + return; + } + + pcb = so->so_pcb; + new_pcb = new_so->so_pcb; + + hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); + /* Remote port is unknown to guest in this type of conneciton */ + hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); + new_pcb->chan = chan; + new_pcb->recv_data_len = 0; + new_pcb->recv_data_off = 0; + new_pcb->rb_init = false; + + new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); + new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); + + hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); + + sc->pcb = new_pcb; + + /* + * Change the socket state to SS_ISCONNECTED. This will promote + * the socket to sol_comp queue and wake up the thread which + * is accepting connection. + */ + soisconnected(new_so); +} + + +/* + * Guest is actively connecting to host. + */ +static void +hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) +{ + struct hvs_pcb *pcb; + int error; + + error = hvsock_open_channel(chan, so); + if (error) { + so->so_error = error; + return; + } + + pcb = so->so_pcb; + pcb->chan = chan; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + pcb->rb_init = false; + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Change the socket state to SS_ISCONNECTED. This will wake up + * the thread sleeping in connect call. + */ + soisconnected(so); +} + +static void +hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) +{ + struct hyperv_guid *inst_guid, *type_guid; + bool conn_from_host; + struct sockaddr_hvs addr; + struct socket *so; + struct hvs_pcb *pcb; + + type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); + inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); + conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); + + HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); + hvsock_print_guid(type_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); + hvsock_print_guid(inst_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", + (conn_from_host == true ) ? "from" : "to"); + + /* + * The listening port should be in [0, MAX_LISTEN_PORT] + */ + if (!is_valid_srv_id(type_guid)) + return; + + /* + * There should be a bound socket already created no matter + * it is a passive or active connection. + * For host initiated connection (passive on guest side), + * the type_guid contains the port which guest is bound and + * listening. + * For the guest initiated connection (active on guest side), + * the inst_guid contains the port that guest has auto bound + * to. + */ + hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid); + so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); + if (!so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: no bound socket found for port %u\n", + __func__, addr.hvs_port); + return; + } + + if (conn_from_host) { + hvsock_open_conn_passive(chan, so, sc); + } else { + (void) hvs_trans_lock(); + pcb = so->so_pcb; + if (pcb && pcb->so) { + sc->pcb = so2hvspcb(so); + hvsock_open_conn_active(chan, so); + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: channel detached before open\n", __func__); + } + hvs_trans_unlock(); + } + +} + +static int +hvsock_probe(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + + if (!channel || !vmbus_chan_is_hvs(channel)) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe called but not a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return ENXIO; + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe got a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return BUS_PROBE_DEFAULT; + } +} + +static int +hvsock_attach(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); + + hvsock_open_connection(channel, sc); + + /* + * Always return success. On error the host will rescind the device + * in 30 seconds and we can do cleanup at that time in + * vmbus_chan_msgproc_chrescind(). + */ + return (0); +} + +static int +hvsock_detach(device_t dev) +{ + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + struct socket *so; + int error, retry; + + if (bootverbose) + device_printf(dev, "hvsock_detach called.\n"); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); + + if (sc->pcb != NULL) { + (void) hvs_trans_lock(); + + so = hsvpcb2so(sc->pcb); + if (so) { + /* Close the connection */ + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_pcb_from_list(sc->pcb, + HVS_LIST_BOUND | HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Close channel while no reader and sender are working + * on the buffer rings. + */ + if (so) { + retry = 0; + while ((error = sblock(&so->so_rcv, 0)) == + EWOULDBLOCK) { + /* + * Someone is reading, rx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for rx reader to exit, " + "retry = %d\n", retry++); + } + retry = 0; + while ((error = sblock(&so->so_snd, 0)) == + EWOULDBLOCK) { + /* + * Someone is sending, tx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for tx sender to exit, " + "retry = %d\n", retry++); + } + } + + + bzero(sc->pcb, sizeof(struct hvs_pcb)); + free(sc->pcb, M_HVSOCK); + sc->pcb = NULL; + + if (so) { + sbunlock(&so->so_rcv); + sbunlock(&so->so_snd); + so->so_pcb = NULL; + } + + hvs_trans_unlock(); + } + + vmbus_chan_close(vmbus_get_channel(dev)); + + return (0); +} + +static device_method_t hvsock_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hvsock_probe), + DEVMETHOD(device_attach, hvsock_attach), + DEVMETHOD(device_detach, hvsock_detach), + DEVMETHOD_END +}; + +static driver_t hvsock_driver = { + "hv_sock", + hvsock_methods, + sizeof(struct hvsock_sc) +}; + +static devclass_t hvsock_devclass; + +DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); +MODULE_VERSION(hvsock, 1); +MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); Property changes on: head/sys/dev/hyperv/hvsock/hv_sock.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/dev/hyperv/hvsock/hv_sock.h =================================================================== --- head/sys/dev/hyperv/hvsock/hv_sock.h (nonexistent) +++ head/sys/dev/hyperv/hvsock/hv_sock.h (revision 361275) @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HVSOCK_H +#define _HVSOCK_H +#include +#include +#include + +#include +#include + +/* + * HyperV Socket Protocols + */ +#define HYPERV_SOCK_PROTO_TRANS 1 /* Transport protocol */ + +#define HVADDR_PORT_ANY -1U +#define HVADDR_PORT_UNKNOWN -1U + +#define HVS_LIST_BOUND 0x01 +#define HVS_LIST_CONNECTED 0x02 +#define HVS_LIST_ALL (HVS_LIST_BOUND | HVS_LIST_CONNECTED) + +struct sockaddr_hvs { + unsigned char sa_len; + sa_family_t sa_family; + unsigned int hvs_port; + unsigned char hvs_zero[sizeof(struct sockaddr) - + sizeof(sa_family_t) - + sizeof(unsigned char) - + sizeof(unsigned int)]; +}; + +struct vmpipe_proto_header { + uint32_t vmpipe_pkt_type; + uint32_t vmpipe_data_size; +} __packed; + +struct hvs_pkt_header { + struct vmbus_chanpkt_hdr chan_pkt_hdr; + struct vmpipe_proto_header vmpipe_pkt_hdr; +} __packed; + +struct hvs_pcb { + struct socket *so; /* Pointer to socket */ + struct sockaddr_hvs local_addr; + struct sockaddr_hvs remote_addr; + + struct hyperv_guid vm_srv_id; + struct hyperv_guid host_srv_id; + + struct vmbus_channel *chan; + /* Current packet header on rx ring */ + struct hvs_pkt_header hvs_pkt; + /* Available data in receive br in current packet */ + uint32_t recv_data_len; + /* offset in the packet */ + uint32_t recv_data_off; + bool rb_init; + /* Link lists for global bound and connected sockets */ + LIST_ENTRY(hvs_pcb) bound_next; + LIST_ENTRY(hvs_pcb) connected_next; +}; + +#define so2hvspcb(so) \ + ((struct hvs_pcb *)((so)->so_pcb)) +#define hsvpcb2so(hvspcb) \ + ((struct socket *)((hvspcb)->so)) + +void hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *); +void hvs_trans_init(void); +void hvs_trans_close(struct socket *); +void hvs_trans_detach(struct socket *); +void hvs_trans_abort(struct socket *); +int hvs_trans_attach(struct socket *, int, struct thread *); +int hvs_trans_bind(struct socket *, struct sockaddr *, struct thread *); +int hvs_trans_listen(struct socket *, int, struct thread *); +int hvs_trans_accept(struct socket *, struct sockaddr **); +int hvs_trans_connect(struct socket *, + struct sockaddr *, struct thread *); +int hvs_trans_peeraddr(struct socket *, struct sockaddr **); +int hvs_trans_sockaddr(struct socket *, struct sockaddr **); +int hvs_trans_soreceive(struct socket *, struct sockaddr **, + struct uio *, struct mbuf **, struct mbuf **, int *); +int hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *, + struct mbuf *, struct mbuf *, int, struct thread *); +int hvs_trans_disconnect(struct socket *); +int hvs_trans_shutdown(struct socket *); + +int hvs_trans_lock(void); +void hvs_trans_unlock(void); + +void hvs_remove_socket_from_list(struct socket *, unsigned char); +#endif /* _HVSOCK_H */ Property changes on: head/sys/dev/hyperv/hvsock/hv_sock.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/dev/hyperv/include/vmbus.h =================================================================== --- head/sys/dev/hyperv/include/vmbus.h (revision 361274) +++ head/sys/dev/hyperv/include/vmbus.h (revision 361275) @@ -1,234 +1,261 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_H_ #define _VMBUS_H_ #include #include +#include /* * VMBUS version is 32 bit, upper 16 bit for major_number and lower * 16 bit for minor_number. * * 0.13 -- Windows Server 2008 * 1.1 -- Windows 7 * 2.4 -- Windows 8 * 3.0 -- Windows 8.1 * 4.0 -- Windows 10 * 5.0 -- Newer Windows 10 */ #define VMBUS_VERSION_WS2008 ((0 << 16) | (13)) #define VMBUS_VERSION_WIN7 ((1 << 16) | (1)) #define VMBUS_VERSION_WIN8 ((2 << 16) | (4)) #define VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) #define VMBUS_VERSION_WIN10 ((4 << 16) | (0)) #define VMBUS_VERSION_WIN10_V5 ((5 << 16) | (0)) #define VMBUS_VERSION_MAJOR(ver) (((uint32_t)(ver)) >> 16) #define VMBUS_VERSION_MINOR(ver) (((uint32_t)(ver)) & 0xffff) #define VMBUS_CHAN_POLLHZ_MIN 100 /* 10ms interval */ #define VMBUS_CHAN_POLLHZ_MAX 1000000 /* 1us interval */ /* * GPA stuffs. */ struct vmbus_gpa_range { uint32_t gpa_len; uint32_t gpa_ofs; uint64_t gpa_page[0]; } __packed; /* This is actually vmbus_gpa_range.gpa_page[1] */ struct vmbus_gpa { uint32_t gpa_len; uint32_t gpa_ofs; uint64_t gpa_page; } __packed; #define VMBUS_CHANPKT_SIZE_SHIFT 3 #define VMBUS_CHANPKT_GETLEN(pktlen) \ (((int)(pktlen)) << VMBUS_CHANPKT_SIZE_SHIFT) struct vmbus_chanpkt_hdr { uint16_t cph_type; /* VMBUS_CHANPKT_TYPE_ */ uint16_t cph_hlen; /* header len, in 8 bytes */ uint16_t cph_tlen; /* total len, in 8 bytes */ uint16_t cph_flags; /* VMBUS_CHANPKT_FLAG_ */ uint64_t cph_xactid; } __packed; #define VMBUS_CHANPKT_TYPE_INBAND 0x0006 #define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 #define VMBUS_CHANPKT_TYPE_GPA 0x0009 #define VMBUS_CHANPKT_TYPE_COMP 0x000b #define VMBUS_CHANPKT_FLAG_NONE 0 #define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ #define VMBUS_CHANPKT_CONST_DATA(pkt) \ (const void *)((const uint8_t *)(pkt) + \ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) /* Include padding */ #define VMBUS_CHANPKT_DATALEN(pkt) \ (VMBUS_CHANPKT_GETLEN((pkt)->cph_tlen) -\ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) struct vmbus_rxbuf_desc { uint32_t rb_len; uint32_t rb_ofs; } __packed; struct vmbus_chanpkt_rxbuf { struct vmbus_chanpkt_hdr cp_hdr; uint16_t cp_rxbuf_id; uint16_t cp_rsvd; uint32_t cp_rxbuf_cnt; struct vmbus_rxbuf_desc cp_rxbuf[]; } __packed; struct vmbus_chan_br { void *cbr; bus_addr_t cbr_paddr; int cbr_txsz; int cbr_rxsz; }; struct vmbus_channel; struct vmbus_xact; struct vmbus_xact_ctx; struct hyperv_guid; struct task; struct taskqueue; typedef void (*vmbus_chan_callback_t)(struct vmbus_channel *, void *); +typedef int (*vmbus_br_copy_callback_t)(void *, int, void *); static __inline struct vmbus_channel * vmbus_get_channel(device_t dev) { return device_get_ivars(dev); } /* * vmbus_chan_open_br() * * Return values: * 0 Succeeded. * EISCONN Failed, and the memory passed through 'br' is still * connected. Callers must _not_ free the the memory * passed through 'br', if this error happens. * other values Failed. The memory passed through 'br' is no longer * connected. Callers are free to do anything with the * memory passed through 'br'. * * * * vmbus_chan_close_direct() * * NOTE: * Callers of this function _must_ make sure to close all sub-channels before * closing the primary channel. * * Return values: * 0 Succeeded. * EISCONN Failed, and the memory associated with the bufring * is still connected. Callers must _not_ free the the * memory associated with the bufring, if this error * happens. * other values Failed. The memory associated with the bufring is * no longer connected. Callers are free to do anything * with the memory associated with the bufring. */ int vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg); int vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg); void vmbus_chan_close(struct vmbus_channel *chan); int vmbus_chan_close_direct(struct vmbus_channel *chan); void vmbus_chan_intr_drain(struct vmbus_channel *chan); void vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task); void vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *); void vmbus_chan_unset_orphan(struct vmbus_channel *chan); const void *vmbus_chan_xact_wait(const struct vmbus_channel *chan, struct vmbus_xact *xact, size_t *resp_len, bool can_sleep); int vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl); int vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl); void vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu); void vmbus_chan_cpu_rr(struct vmbus_channel *chan); void vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on); struct vmbus_channel ** vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt); void vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt); void vmbus_subchan_drain(struct vmbus_channel *pri_chan); int vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen, uint64_t *xactid); int vmbus_chan_recv_pkt(struct vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt, int *pktlen); +int vmbus_chan_recv_idxadv(struct vmbus_channel *chan, + uint32_t advance); +int vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance); +int vmbus_chan_recv_peek_call(struct vmbus_channel *chan, + int data_len, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg); + int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid); int vmbus_chan_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid); int vmbus_chan_send_prplist(struct vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid); +int vmbus_chan_iov_send(struct vmbus_channel *chan, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg); +uint32_t vmbus_chan_write_available(struct vmbus_channel *chan); +uint32_t vmbus_chan_read_available(struct vmbus_channel *chan); +bool vmbus_chan_write_signal(struct vmbus_channel *chan, + int32_t min_signal_size); +void vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, + uint32_t size); uint32_t vmbus_chan_id(const struct vmbus_channel *chan); uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan); bool vmbus_chan_is_primary(const struct vmbus_channel *chan); bool vmbus_chan_is_revoked(const struct vmbus_channel *chan); -const struct hyperv_guid * - vmbus_chan_guid_inst(const struct vmbus_channel *chan); +bool vmbus_chan_is_hvs(const struct vmbus_channel *chan); +bool vmbus_chan_is_hvs_conn_from_host( + const struct vmbus_channel *chan); +int vmbus_req_tl_connect(struct hyperv_guid *, + struct hyperv_guid *); + +struct hyperv_guid * + vmbus_chan_guid_type(struct vmbus_channel *chan); +struct hyperv_guid * + vmbus_chan_guid_inst(struct vmbus_channel *chan); int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max); bool vmbus_chan_rx_empty(const struct vmbus_channel *chan); bool vmbus_chan_tx_empty(const struct vmbus_channel *chan); struct taskqueue * vmbus_chan_mgmt_tq(const struct vmbus_channel *chan); void vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz); void vmbus_chan_poll_disable(struct vmbus_channel *chan); #endif /* !_VMBUS_H_ */ Index: head/sys/dev/hyperv/vmbus/vmbus.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus.c (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus.c (revision 361275) @@ -1,1557 +1,1593 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * VM Bus Driver Implementation */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "pcib_if.h" #include "vmbus_if.h" #define VMBUS_GPADL_START 0xe1e10 struct vmbus_msghc { struct vmbus_xact *mh_xact; struct hypercall_postmsg_in mh_inprm_save; }; static void vmbus_identify(driver_t *, device_t); static int vmbus_probe(device_t); static int vmbus_attach(device_t); static int vmbus_detach(device_t); static int vmbus_read_ivar(device_t, device_t, int, uintptr_t *); static int vmbus_child_pnpinfo_str(device_t, device_t, char *, size_t); static struct resource *vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs); static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs); static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq); static int vmbus_release_msix(device_t bus, device_t dev, int irq); static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data); static uint32_t vmbus_get_version_method(device_t, device_t); static int vmbus_probe_guid_method(device_t, device_t, const struct hyperv_guid *); static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu); static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t, int); #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *); #endif static int vmbus_init(struct vmbus_softc *); static int vmbus_connect(struct vmbus_softc *, uint32_t); static int vmbus_req_channels(struct vmbus_softc *sc); static void vmbus_disconnect(struct vmbus_softc *); static int vmbus_scan(struct vmbus_softc *); static void vmbus_scan_teardown(struct vmbus_softc *); static void vmbus_scan_done(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chanmsg_handle(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_msg_task(void *, int); static void vmbus_synic_setup(void *); static void vmbus_synic_teardown(void *); static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS); static int vmbus_dma_alloc(struct vmbus_softc *); static void vmbus_dma_free(struct vmbus_softc *); static int vmbus_intr_setup(struct vmbus_softc *); static void vmbus_intr_teardown(struct vmbus_softc *); static int vmbus_doattach(struct vmbus_softc *); static void vmbus_event_proc_dummy(struct vmbus_softc *, int); static struct vmbus_softc *vmbus_sc; SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V vmbus"); static int vmbus_pin_evttask = 1; SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN, &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU"); extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti); uint32_t vmbus_current_version; static const uint32_t vmbus_version[] = { VMBUS_VERSION_WIN10, VMBUS_VERSION_WIN8_1, VMBUS_VERSION_WIN8, VMBUS_VERSION_WIN7, VMBUS_VERSION_WS2008 }; static const vmbus_chanmsg_proc_t vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done), VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP) }; static device_method_t vmbus_methods[] = { /* Device interface */ DEVMETHOD(device_identify, vmbus_identify), DEVMETHOD(device_probe, vmbus_probe), DEVMETHOD(device_attach, vmbus_attach), DEVMETHOD(device_detach, vmbus_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_read_ivar, vmbus_read_ivar), DEVMETHOD(bus_child_pnpinfo_str, vmbus_child_pnpinfo_str), DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), #if __FreeBSD_version >= 1100000 DEVMETHOD(bus_get_cpus, bus_generic_get_cpus), #endif /* pcib interface */ DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_release_msix), DEVMETHOD(pcib_map_msi, vmbus_map_msi), /* Vmbus interface */ DEVMETHOD(vmbus_get_version, vmbus_get_version_method), DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method), DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method), DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method), DEVMETHOD_END }; static driver_t vmbus_driver = { "vmbus", vmbus_methods, sizeof(struct vmbus_softc) }; static devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL); DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass, NULL, NULL); MODULE_DEPEND(vmbus, acpi, 1, 1, 1); MODULE_DEPEND(vmbus, pci, 1, 1, 1); MODULE_VERSION(vmbus, 1); static __inline struct vmbus_softc * vmbus_get_softc(void) { return vmbus_sc; } void vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) { struct hypercall_postmsg_in *inprm; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); inprm = vmbus_xact_req_data(mh->mh_xact); memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); inprm->hc_connid = VMBUS_CONNID_MESSAGE; inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; inprm->hc_dsize = dsize; } struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) { struct vmbus_msghc *mh; struct vmbus_xact *xact; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); xact = vmbus_xact_get(sc->vmbus_xc, dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0])); if (xact == NULL) return (NULL); mh = vmbus_xact_priv(xact, sizeof(*mh)); mh->mh_xact = xact; vmbus_msghc_reset(mh, dsize); return (mh); } void vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_put(mh->mh_xact); } void * vmbus_msghc_dataptr(struct vmbus_msghc *mh) { struct hypercall_postmsg_in *inprm; inprm = vmbus_xact_req_data(mh->mh_xact); return (inprm->hc_data); } int vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) { sbintime_t time = SBT_1MS; struct hypercall_postmsg_in *inprm; bus_addr_t inprm_paddr; int i; inprm = vmbus_xact_req_data(mh->mh_xact); inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); /* * Save the input parameter so that we could restore the input * parameter if the Hypercall failed. * * XXX * Is this really necessary?! i.e. Will the Hypercall ever * overwrite the input parameter? */ memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); /* * In order to cope with transient failures, e.g. insufficient * resources on host side, we retry the post message Hypercall * several times. 20 retries seem sufficient. */ #define HC_RETRY_MAX 20 for (i = 0; i < HC_RETRY_MAX; ++i) { uint64_t status; status = hypercall_post_message(inprm_paddr); if (status == HYPERCALL_STATUS_SUCCESS) return 0; pause_sbt("hcpmsg", time, 0, C_HARDCLOCK); if (time < SBT_1S * 2) time *= 2; /* Restore input parameter and try again */ memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); } #undef HC_RETRY_MAX return EIO; } int vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { int error; vmbus_xact_activate(mh->mh_xact); error = vmbus_msghc_exec_noresult(mh); if (error) vmbus_xact_deactivate(mh->mh_xact); return error; } void vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_deactivate(mh->mh_xact); } const struct vmbus_message * vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_wait(mh->mh_xact, &resp_len)); } const struct vmbus_message * vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_poll(mh->mh_xact, &resp_len)); } void vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); } uint32_t vmbus_gpadl_alloc(struct vmbus_softc *sc) { uint32_t gpadl; again: - gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); + gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); if (gpadl == 0) goto again; return (gpadl); +} + +/* Used for Hyper-V socket when guest client connects to host */ +int +vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id, + struct hyperv_guid *host_srv_id) +{ + struct vmbus_softc *sc = vmbus_get_softc(); + struct vmbus_chanmsg_tl_connect *req; + struct vmbus_msghc *mh; + int error; + + if (!sc) + return ENXIO; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + device_printf(sc->vmbus_dev, + "can not get msg hypercall for tl connect\n"); + return ENXIO; + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN; + req->guest_endpoint_id = *guest_srv_id; + req->host_service_id = *host_srv_id; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + device_printf(sc->vmbus_dev, + "tl connect msg hypercall failed\n"); + } + + return error; } static int vmbus_connect(struct vmbus_softc *sc, uint32_t version) { struct vmbus_chanmsg_connect *req; const struct vmbus_message *msg; struct vmbus_msghc *mh; int error, done = 0; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT; req->chm_ver = version; req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr; req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr; req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); return error; } msg = vmbus_msghc_wait_result(sc, mh); done = ((const struct vmbus_chanmsg_connect_resp *) msg->msg_data)->chm_done; vmbus_msghc_put(sc, mh); return (done ? 0 : EOPNOTSUPP); } static int vmbus_init(struct vmbus_softc *sc) { int i; for (i = 0; i < nitems(vmbus_version); ++i) { int error; error = vmbus_connect(sc, vmbus_version[i]); if (!error) { vmbus_current_version = vmbus_version[i]; sc->vmbus_version = vmbus_version[i]; device_printf(sc->vmbus_dev, "version %u.%u\n", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return 0; } } return ENXIO; } static void vmbus_disconnect(struct vmbus_softc *sc) { struct vmbus_chanmsg_disconnect *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for disconnect\n"); return; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "disconnect msg hypercall failed\n"); } } static int vmbus_req_channels(struct vmbus_softc *sc) { struct vmbus_chanmsg_chrequest *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); return error; } static void vmbus_scan_done_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; mtx_lock(&Giant); sc->vmbus_scandone = true; mtx_unlock(&Giant); wakeup(&sc->vmbus_scandone); } static void vmbus_scan_done(struct vmbus_softc *sc, const struct vmbus_message *msg __unused) { taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); } static int vmbus_scan(struct vmbus_softc *sc) { int error; /* * Identify, probe and attach for non-channel devices. */ bus_generic_probe(sc->vmbus_dev); bus_generic_attach(sc->vmbus_dev); /* * This taskqueue serializes vmbus devices' attach and detach * for channel offer and rescind messages. */ sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_devtq); taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); /* * This taskqueue handles sub-channel detach, so that vmbus * device's detach running in vmbus_devtq can drain its sub- * channels. */ sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_subchtq); taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); /* * Start vmbus scanning. */ error = vmbus_req_channels(sc); if (error) { device_printf(sc->vmbus_dev, "channel request failed: %d\n", error); return (error); } /* * Wait for all vmbus devices from the initial channel offers to be * attached. */ GIANT_REQUIRED; while (!sc->vmbus_scandone) mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0); if (bootverbose) { device_printf(sc->vmbus_dev, "device scan, probe and attach " "done\n"); } return (0); } static void vmbus_scan_teardown(struct vmbus_softc *sc) { GIANT_REQUIRED; if (sc->vmbus_devtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_devtq); mtx_lock(&Giant); sc->vmbus_devtq = NULL; } if (sc->vmbus_subchtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_subchtq); mtx_lock(&Giant); sc->vmbus_subchtq = NULL; } } static void vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) { device_printf(sc->vmbus_dev, "unknown message type 0x%x\n", msg_type); return; } msg_proc = vmbus_chanmsg_handlers[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); /* Channel specific processing */ vmbus_chan_msgproc(sc, msg); } static void vmbus_msg_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; volatile struct vmbus_message *msg; msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE; for (;;) { if (msg->msg_type == HYPERV_MSGTYPE_NONE) { /* No message */ break; } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) { /* Channel message */ vmbus_chanmsg_handle(sc, __DEVOLATILE(const struct vmbus_message *, msg)); } msg->msg_type = HYPERV_MSGTYPE_NONE; /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } } static __inline int vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu) { volatile struct vmbus_message *msg; struct vmbus_message *msg_base; msg_base = VMBUS_PCPU_GET(sc, message, cpu); /* * Check event timer. * * TODO: move this to independent IDT vector. */ msg = msg_base + VMBUS_SINT_TIMER; if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) { msg->msg_type = HYPERV_MSGTYPE_NONE; vmbus_et_intr(frame); /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } /* * Check events. Hot path for network and storage I/O data; high rate. * * NOTE: * As recommended by the Windows guest fellows, we check events before * checking messages. */ sc->vmbus_event_proc(sc, cpu); /* * Check messages. Mainly management stuffs; ultra low rate. */ msg = msg_base + VMBUS_SINT_MESSAGE; if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); } return (FILTER_HANDLED); } void vmbus_handle_intr(struct trapframe *trap_frame) { struct vmbus_softc *sc = vmbus_get_softc(); int cpu = curcpu; /* * Disable preemption. */ critical_enter(); /* * Do a little interrupt counting. */ (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++; vmbus_handle_intr1(sc, trap_frame, cpu); /* * Enable preemption. */ critical_exit(); } static void vmbus_synic_setup(void *xsc) { struct vmbus_softc *sc = xsc; int cpu = curcpu; uint64_t val, orig; uint32_t sint; if (hyperv_features & CPUID_HV_MSR_VP_INDEX) { /* Save virtual processor id. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX); } else { /* Set virtual processor id to 0 for compatibility. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0; } /* * Setup the SynIC message. */ orig = rdmsr(MSR_HV_SIMP); val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIMP_PGSHIFT); wrmsr(MSR_HV_SIMP, val); /* * Setup the SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT); wrmsr(MSR_HV_SIEFP, val); /* * Configure and unmask SINT for message and event flags. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * Configure and unmask SINT for timer. */ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * All done; enable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK); wrmsr(MSR_HV_SCONTROL, val); } static void vmbus_synic_teardown(void *arg) { uint64_t orig; uint32_t sint; /* * Disable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK)); /* * Mask message and event flags SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Mask timer SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Teardown SynIC message. */ orig = rdmsr(MSR_HV_SIMP); wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK)); /* * Teardown SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK)); } static int vmbus_dma_alloc(struct vmbus_softc *sc) { bus_dma_tag_t parent_dtag; uint8_t *evtflags; int cpu; parent_dtag = bus_get_dma_tag(sc->vmbus_dev); CPU_FOREACH(cpu) { void *ptr; /* * Per-cpu messages and event flags. */ ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, message, cpu) = ptr; ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr; } evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (evtflags == NULL) return ENOMEM; sc->vmbus_rx_evtflags = (u_long *)evtflags; sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2)); sc->vmbus_evtflags = evtflags; sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf1 == NULL) return ENOMEM; sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf2 == NULL) return ENOMEM; return 0; } static void vmbus_dma_free(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_evtflags != NULL) { hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags); sc->vmbus_evtflags = NULL; sc->vmbus_rx_evtflags = NULL; sc->vmbus_tx_evtflags = NULL; } if (sc->vmbus_mnf1 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1); sc->vmbus_mnf1 = NULL; } if (sc->vmbus_mnf2 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2); sc->vmbus_mnf2 = NULL; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, message_dma, cpu), VMBUS_PCPU_GET(sc, message, cpu)); VMBUS_PCPU_GET(sc, message, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), VMBUS_PCPU_GET(sc, event_flags, cpu)); VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL; } } } static int vmbus_intr_setup(struct vmbus_softc *sc) { int cpu; CPU_FOREACH(cpu) { char buf[MAXCOMLEN + 1]; cpuset_t cpu_mask; /* Allocate an interrupt counter for Hyper-V interrupt */ snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu); intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu)); /* * Setup taskqueue to handle events. Task will be per- * channel. */ VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast( "hyperv event", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, event_tq, cpu)); if (vmbus_pin_evttask) { CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, &cpu_mask, "hvevent%d", cpu); } else { taskqueue_start_threads( VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, "hvevent%d", cpu); } /* * Setup tasks and taskqueues to handle messages. */ VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast( "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, message_tq, cpu)); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask, "hvmsg%d", cpu); TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0, vmbus_msg_task, sc); } /* * All Hyper-V ISR required resources are setup, now let's find a * free IDT vector for Hyper-V ISR and set it up. */ sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) : IDTVEC(vmbus_isr)); if (sc->vmbus_idtvec < 0) { device_printf(sc->vmbus_dev, "cannot find free IDT vector\n"); return ENXIO; } if (bootverbose) { device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n", sc->vmbus_idtvec); } return 0; } static void vmbus_intr_teardown(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_idtvec >= 0) { lapic_ipi_free(sc->vmbus_idtvec); sc->vmbus_idtvec = -1; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, event_tq, cpu) != NULL) { taskqueue_free(VMBUS_PCPU_GET(sc, event_tq, cpu)); VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, message_tq, cpu) != NULL) { taskqueue_drain(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); taskqueue_free(VMBUS_PCPU_GET(sc, message_tq, cpu)); VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL; } } } static int vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { return (ENOENT); } static int vmbus_child_pnpinfo_str(device_t dev, device_t child, char *buf, size_t buflen) { const struct vmbus_channel *chan; char guidbuf[HYPERV_GUID_STRLEN]; chan = vmbus_get_channel(child); if (chan == NULL) { /* Event timer device, which does not belong to a channel */ return (0); } strlcat(buf, "classid=", buflen); hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); strlcat(buf, " deviceid=", buflen); hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); return (0); } int vmbus_add_child(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; device_t parent = sc->vmbus_dev; mtx_lock(&Giant); chan->ch_dev = device_add_child(parent, NULL, -1); if (chan->ch_dev == NULL) { mtx_unlock(&Giant); device_printf(parent, "device_add_child for chan%u failed\n", chan->ch_id); return (ENXIO); } device_set_ivars(chan->ch_dev, chan); device_probe_and_attach(chan->ch_dev); mtx_unlock(&Giant); return (0); } int vmbus_delete_child(struct vmbus_channel *chan) { int error = 0; mtx_lock(&Giant); if (chan->ch_dev != NULL) { error = device_delete_child(chan->ch_vmbus->vmbus_dev, chan->ch_dev); chan->ch_dev = NULL; } mtx_unlock(&Giant); return (error); } static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS) { struct vmbus_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } /* * We need the function to make sure the MMIO resource is allocated from the * ranges found in _CRS. * * For the release function, we can use bus_generic_release_resource(). */ static struct resource * vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { device_t parent = device_get_parent(dev); struct resource *res; #ifdef NEW_PCIB if (type == SYS_RES_MEMORY) { struct vmbus_softc *sc = device_get_softc(dev); res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type, rid, start, end, count, flags); } else #endif { res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start, end, count, flags); } return (res); } static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs) { return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, irqs)); } static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs) { return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); } static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq) { return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_release_msix(device_t bus, device_t dev, int irq) { return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data)); } static uint32_t vmbus_get_version_method(device_t bus, device_t dev) { struct vmbus_softc *sc = device_get_softc(bus); return sc->vmbus_version; } static int vmbus_probe_guid_method(device_t bus, device_t dev, const struct hyperv_guid *guid) { const struct vmbus_channel *chan = vmbus_get_channel(dev); if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0) return 0; return ENXIO; } static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); return (VMBUS_PCPU_GET(sc, vcpuid, cpu)); } static struct taskqueue * vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu)); return (VMBUS_PCPU_GET(sc, event_tq, cpu)); } #ifdef NEW_PCIB #define VTPM_BASE_ADDR 0xfed40000 #define FOUR_GB (1ULL << 32) enum parse_pass { parse_64, parse_32 }; struct parse_context { device_t vmbus_dev; enum parse_pass pass; }; static ACPI_STATUS parse_crs(ACPI_RESOURCE *res, void *ctx) { const struct parse_context *pc = ctx; device_t vmbus_dev = pc->vmbus_dev; struct vmbus_softc *sc = device_get_softc(vmbus_dev); UINT64 start, end; switch (res->Type) { case ACPI_RESOURCE_TYPE_ADDRESS32: start = res->Data.Address32.Address.Minimum; end = res->Data.Address32.Address.Maximum; break; case ACPI_RESOURCE_TYPE_ADDRESS64: start = res->Data.Address64.Address.Minimum; end = res->Data.Address64.Address.Maximum; break; default: /* Unused types. */ return (AE_OK); } /* * We don't use <1MB addresses. */ if (end < 0x100000) return (AE_OK); /* Don't conflict with vTPM. */ if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR) end = VTPM_BASE_ADDR - 1; if ((pc->pass == parse_32 && start < FOUR_GB) || (pc->pass == parse_64 && start >= FOUR_GB)) pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY, start, end, 0); return (AE_OK); } static void vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass) { struct parse_context pc; ACPI_STATUS status; if (bootverbose) device_printf(dev, "walking _CRS, pass=%d\n", pass); pc.vmbus_dev = vmbus_dev; pc.pass = pass; status = AcpiWalkResources(acpi_get_handle(dev), "_CRS", parse_crs, &pc); if (bootverbose && ACPI_FAILURE(status)) device_printf(dev, "_CRS: not found, pass=%d\n", pass); } static void vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass) { device_t acpi0, parent; parent = device_get_parent(dev); acpi0 = device_get_parent(parent); if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) { device_t *children; int count; /* * Try to locate VMBUS resources and find _CRS on them. */ if (device_get_children(acpi0, &children, &count) == 0) { int i; for (i = 0; i < count; ++i) { if (!device_is_attached(children[i])) continue; if (strcmp("vmbus_res", device_get_name(children[i])) == 0) vmbus_get_crs(children[i], dev, pass); } free(children, M_TEMP); } /* * Try to find _CRS on acpi. */ vmbus_get_crs(acpi0, dev, pass); } else { device_printf(dev, "not grandchild of acpi\n"); } /* * Try to find _CRS on parent. */ vmbus_get_crs(parent, dev, pass); } static void vmbus_get_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); /* * We walk the resources twice to make sure that: in the resource * list, the 32-bit resources appear behind the 64-bit resources. * NB: resource_list_add() uses INSERT_TAIL. This way, when we * iterate through the list to find a range for a 64-bit BAR in * vmbus_alloc_resource(), we can make sure we try to use >4GB * ranges first. */ pcib_host_res_init(dev, &sc->vmbus_mmio_res); vmbus_get_mmio_res_pass(dev, parse_64); vmbus_get_mmio_res_pass(dev, parse_32); } static void vmbus_free_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); pcib_host_res_free(dev, &sc->vmbus_mmio_res); } #endif /* NEW_PCIB */ static void vmbus_identify(driver_t *driver, device_t parent) { if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return; device_add_child(parent, "vmbus", -1); } static int vmbus_probe(device_t dev) { if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return (ENXIO); device_set_desc(dev, "Hyper-V Vmbus"); return (BUS_PROBE_DEFAULT); } /** * @brief Main vmbus driver initialization routine. * * Here, we * - initialize the vmbus driver context * - setup various driver entry points * - invoke the vmbus hv main init routine * - get the irq resource * - invoke the vmbus to add the vmbus root device * - setup the vmbus root device * - retrieve the channel offers */ static int vmbus_doattach(struct vmbus_softc *sc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int ret; if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED) return (0); #ifdef NEW_PCIB vmbus_get_mmio_res(sc->vmbus_dev); #endif sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; sc->vmbus_gpadl = VMBUS_GPADL_START; mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_prichans); mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_chans); sc->vmbus_chmap = malloc( sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, M_WAITOK | M_ZERO); /* * Create context for "post message" Hypercalls */ sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, sizeof(struct vmbus_msghc)); if (sc->vmbus_xc == NULL) { ret = ENXIO; goto cleanup; } /* * Allocate DMA stuffs. */ ret = vmbus_dma_alloc(sc); if (ret != 0) goto cleanup; /* * Setup interrupt. */ ret = vmbus_intr_setup(sc); if (ret != 0) goto cleanup; /* * Setup SynIC. */ if (bootverbose) device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started); smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc); sc->vmbus_flags |= VMBUS_FLAG_SYNIC; /* * Initialize vmbus, e.g. connect to Hypervisor. */ ret = vmbus_init(sc); if (ret != 0) goto cleanup; if (sc->vmbus_version == VMBUS_VERSION_WS2008 || sc->vmbus_version == VMBUS_VERSION_WIN7) sc->vmbus_event_proc = vmbus_event_proc_compat; else sc->vmbus_event_proc = vmbus_event_proc; ret = vmbus_scan(sc); if (ret != 0) goto cleanup; ctx = device_get_sysctl_ctx(sc->vmbus_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev)); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, vmbus_sysctl_version, "A", "vmbus version"); return (ret); cleanup: vmbus_scan_teardown(sc); vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); return (ret); } static void vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused) { } #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *xsc) { struct vmbus_softc *sc = xsc; if (bootverbose) device_printf(sc->vmbus_dev, "intrhook\n"); vmbus_doattach(sc); config_intrhook_disestablish(&sc->vmbus_intrhook); } #endif /* EARLY_AP_STARTUP */ static int vmbus_attach(device_t dev) { vmbus_sc = device_get_softc(dev); vmbus_sc->vmbus_dev = dev; vmbus_sc->vmbus_idtvec = -1; /* * Event processing logic will be configured: * - After the vmbus protocol version negotiation. * - Before we request channel offers. */ vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy; #ifdef EARLY_AP_STARTUP /* * Defer the real attach until the pause(9) works as expected. */ vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook; vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc; config_intrhook_establish(&vmbus_sc->vmbus_intrhook); #else /* !EARLY_AP_STARTUP */ /* * If the system has already booted and thread * scheduling is possible indicated by the global * cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(vmbus_sc); #endif /* EARLY_AP_STARTUP */ return (0); } static int vmbus_detach(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); bus_generic_detach(dev); vmbus_chan_destroy_all(sc); vmbus_scan_teardown(sc); vmbus_disconnect(sc); if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC; smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL); } vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); #ifdef NEW_PCIB vmbus_free_mmio_res(dev); #endif return (0); } #ifndef EARLY_AP_STARTUP static void vmbus_sysinit(void *arg __unused) { struct vmbus_softc *sc = vmbus_get_softc(); if (vm_guest != VM_GUEST_HV || sc == NULL) return; /* * If the system has already booted and thread * scheduling is possible, as indicated by the * global cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(sc); } /* * NOTE: * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is * initialized. */ SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL); #endif /* !EARLY_AP_STARTUP */ Index: head/sys/dev/hyperv/vmbus/vmbus_br.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_br.c (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus_br.c (revision 361275) @@ -1,407 +1,719 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include /* Amount of space available for write */ #define VMBUS_BR_WAVAIL(r, w, z) \ (((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w))) /* Increase bufing index */ #define VMBUS_BR_IDXINC(idx, inc, sz) (((idx) + (inc)) % (sz)) static int vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS); static int vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS); static void vmbus_br_setup(struct vmbus_br *, void *, int); static int vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS) { const struct vmbus_br *br = arg1; - uint32_t rindex, windex, imask, ravail, wavail; + uint32_t rindex, windex, imask, psndsz, fvalue, ravail, wavail; + uint64_t intrcnt; char state[256]; + intrcnt = br->vbr_intrcnt; rindex = br->vbr_rindex; windex = br->vbr_windex; imask = br->vbr_imask; + psndsz = br->vbr_psndsz; + fvalue = br->vbr_fvalue; wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize); ravail = br->vbr_dsize - wavail; snprintf(state, sizeof(state), - "rindex:%u windex:%u imask:%u ravail:%u wavail:%u", - rindex, windex, imask, ravail, wavail); + "intrcnt:%lu rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u " + "ravail:%u wavail:%u", + intrcnt, rindex, windex, imask, psndsz, fvalue, ravail, wavail); return sysctl_handle_string(oidp, state, sizeof(state), req); } /* * Binary bufring states. */ static int vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS) { #define BR_STATE_RIDX 0 #define BR_STATE_WIDX 1 #define BR_STATE_IMSK 2 -#define BR_STATE_RSPC 3 -#define BR_STATE_WSPC 4 -#define BR_STATE_MAX 5 +#define BR_STATE_PSSZ 3 +#define BR_STATE_FVAL 4 +#define BR_STATE_RSPC 5 +#define BR_STATE_WSPC 6 +#define BR_STATE_MAX 7 const struct vmbus_br *br = arg1; uint32_t rindex, windex, wavail, state[BR_STATE_MAX]; rindex = br->vbr_rindex; windex = br->vbr_windex; wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize); state[BR_STATE_RIDX] = rindex; state[BR_STATE_WIDX] = windex; state[BR_STATE_IMSK] = br->vbr_imask; + state[BR_STATE_PSSZ] = br->vbr_psndsz; + state[BR_STATE_FVAL] = br->vbr_fvalue; state[BR_STATE_WSPC] = wavail; state[BR_STATE_RSPC] = br->vbr_dsize - wavail; return sysctl_handle_opaque(oidp, state, sizeof(state), req); } void vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree, struct vmbus_br *br, const char *name) { struct sysctl_oid *tree; char desc[64]; tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(br_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (tree == NULL) return; snprintf(desc, sizeof(desc), "%s state", name); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, br, 0, vmbus_br_sysctl_state, "A", desc); snprintf(desc, sizeof(desc), "%s binary state", name); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state_bin", CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, br, 0, vmbus_br_sysctl_state_bin, "IU", desc); } void vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr) { rbr->rxbr_imask = 1; mb(); } static __inline uint32_t vmbus_rxbr_avail(const struct vmbus_rxbr *rbr) { uint32_t rindex, windex; /* Get snapshot */ rindex = rbr->rxbr_rindex; windex = rbr->rxbr_windex; return (rbr->rxbr_dsize - VMBUS_BR_WAVAIL(rindex, windex, rbr->rxbr_dsize)); } uint32_t +vmbus_rxbr_available(const struct vmbus_rxbr *rbr) +{ + return (vmbus_rxbr_avail(rbr)); +} + +uint32_t vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr) { rbr->rxbr_imask = 0; mb(); /* * Now check to see if the ring buffer is still empty. * If it is not, we raced and we need to process new * incoming channel packets. */ return vmbus_rxbr_avail(rbr); } static void vmbus_br_setup(struct vmbus_br *br, void *buf, int blen) { br->vbr = buf; br->vbr_dsize = blen - sizeof(struct vmbus_bufring); } void vmbus_rxbr_init(struct vmbus_rxbr *rbr) { mtx_init(&rbr->rxbr_lock, "vmbus_rxbr", NULL, MTX_SPIN); } void vmbus_rxbr_deinit(struct vmbus_rxbr *rbr) { mtx_destroy(&rbr->rxbr_lock); } void vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen) { vmbus_br_setup(&rbr->rxbr, buf, blen); } +static __inline boolean_t +vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t bytes_read) +{ + uint32_t pending_snd_sz, canwrite_size; + + /* No need to signal if host doesn't want us to */ + if (!rbr->rxbr_fpsndsz) + return false; + + mb(); + + pending_snd_sz = rbr->rxbr_psndsz; + /* No need to signal if host sets pending_snd_sz to 0 */ + if (!pending_snd_sz) + return false; + + mb(); + + canwrite_size = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr); + + /* No need to signal if br already has enough space before read */ + if (canwrite_size - bytes_read > pending_snd_sz) + return false; + + /* + * No need to signal if still doesn't have enough space + * asked by host + */ + if (canwrite_size <= pending_snd_sz) + return false; + + return true; +} + void vmbus_txbr_init(struct vmbus_txbr *tbr) { mtx_init(&tbr->txbr_lock, "vmbus_txbr", NULL, MTX_SPIN); } void vmbus_txbr_deinit(struct vmbus_txbr *tbr) { mtx_destroy(&tbr->txbr_lock); } void vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen) { vmbus_br_setup(&tbr->txbr, buf, blen); + + /* Set feature bit enabling flow control */ + tbr->txbr_fpsndsz = 1; } +uint32_t +vmbus_txbr_get_imask(const struct vmbus_txbr *tbr) +{ + mb(); + + return(tbr->txbr_imask); +} + +void +vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size) +{ + tbr->txbr_psndsz = size; +} + /* * When we write to the ring buffer, check if the host needs to be * signaled. * * The contract: * - The host guarantees that while it is draining the TX bufring, * it will set the br_imask to indicate it does not need to be * interrupted when new data are added. * - The host guarantees that it will completely drain the TX bufring * before exiting the read loop. Further, once the TX bufring is * empty, it will clear the br_imask and re-check to see if new * data have arrived. */ static __inline boolean_t vmbus_txbr_need_signal(const struct vmbus_txbr *tbr, uint32_t old_windex) { mb(); if (tbr->txbr_imask) return (FALSE); __compiler_membar(); /* * This is the only case we need to signal when the * ring transitions from being empty to non-empty. */ if (old_windex == tbr->txbr_rindex) return (TRUE); return (FALSE); } static __inline uint32_t vmbus_txbr_avail(const struct vmbus_txbr *tbr) { uint32_t rindex, windex; /* Get snapshot */ rindex = tbr->txbr_rindex; windex = tbr->txbr_windex; return VMBUS_BR_WAVAIL(rindex, windex, tbr->txbr_dsize); } static __inline uint32_t vmbus_txbr_copyto(const struct vmbus_txbr *tbr, uint32_t windex, const void *src0, uint32_t cplen) { const uint8_t *src = src0; uint8_t *br_data = tbr->txbr_data; uint32_t br_dsize = tbr->txbr_dsize; if (cplen > br_dsize - windex) { uint32_t fraglen = br_dsize - windex; /* Wrap-around detected */ memcpy(br_data + windex, src, fraglen); memcpy(br_data, src + fraglen, cplen - fraglen); } else { memcpy(br_data + windex, src, cplen); } return VMBUS_BR_IDXINC(windex, cplen, br_dsize); } +static __inline uint32_t +vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex, + uint32_t cplen, vmbus_br_copy_callback_t cb, void *cbarg, int *ret) +{ + uint8_t *br_data = tbr->txbr_data; + uint32_t br_dsize = tbr->txbr_dsize; + int err = 0; + + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + err = cb((void *)(br_data + windex), fraglen, cbarg); + if (!err) + err = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + err = cb((void *)(br_data + windex), cplen, cbarg); + } + + *ret = err; + + return VMBUS_BR_IDXINC(windex, cplen, br_dsize); +} + +uint32_t +vmbus_txbr_available(const struct vmbus_txbr *tbr) +{ + return (vmbus_txbr_avail(tbr)); +} + /* + * NOTE: + * Not holding lock when calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. + */ +int +vmbus_txbr_write_call(struct vmbus_txbr *tbr, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg, + boolean_t *need_sig) +{ + uint32_t old_windex, windex, total; + uint64_t save_windex; + int i; + int cb_ret = 0; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + + /* + * NOTE: + * If this write is going to make br_windex same as br_rindex, + * i.e. the available space for write is same as the write size, + * we can't do it then, since br_windex == br_rindex means that + * the bufring is empty. + */ + if (vmbus_txbr_avail(tbr) <= total) { + return (EAGAIN); + } + + /* Save br_windex for later use */ + old_windex = tbr->txbr_windex; + + /* + * Copy the scattered channel packet to the TX bufring. + */ + windex = old_windex; + for (i = 0; i < iovlen; i++) { + if (iov[i].iov_base != NULL) { + windex = vmbus_txbr_copyto(tbr, windex, + iov[i].iov_base, iov[i].iov_len); + } else if (cb != NULL) { + windex = vmbus_txbr_copyto_call(tbr, windex, + iov[i].iov_len, cb, cbarg, &cb_ret); + /* + * If callback fails, return without updating + * write index. + */ + if (cb_ret) + return (cb_ret); + } + } + + mtx_lock_spin(&tbr->txbr_lock); + + /* + * Set the offset of the current channel packet. + */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* + * Update the write index _after_ the channel packet + * is copied. + */ + __compiler_membar(); + tbr->txbr_windex = windex; + + mtx_unlock_spin(&tbr->txbr_lock); + + if (need_sig) + *need_sig = vmbus_txbr_need_signal(tbr, old_windex); + + return (0); +} + +/* * Write scattered channel packet to TX bufring. * * The offset of this channel packet is written as a 64bits value * immediately after this channel packet. */ int vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen, boolean_t *need_sig) { uint32_t old_windex, windex, total; uint64_t save_windex; int i; total = 0; for (i = 0; i < iovlen; i++) total += iov[i].iov_len; total += sizeof(save_windex); mtx_lock_spin(&tbr->txbr_lock); /* * NOTE: * If this write is going to make br_windex same as br_rindex, * i.e. the available space for write is same as the write size, * we can't do it then, since br_windex == br_rindex means that * the bufring is empty. */ if (vmbus_txbr_avail(tbr) <= total) { mtx_unlock_spin(&tbr->txbr_lock); return (EAGAIN); } /* Save br_windex for later use */ old_windex = tbr->txbr_windex; /* * Copy the scattered channel packet to the TX bufring. */ windex = old_windex; for (i = 0; i < iovlen; i++) { windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, iov[i].iov_len); } /* * Set the offset of the current channel packet. */ save_windex = ((uint64_t)old_windex) << 32; windex = vmbus_txbr_copyto(tbr, windex, &save_windex, sizeof(save_windex)); /* * Update the write index _after_ the channel packet * is copied. */ __compiler_membar(); tbr->txbr_windex = windex; mtx_unlock_spin(&tbr->txbr_lock); *need_sig = vmbus_txbr_need_signal(tbr, old_windex); return (0); } static __inline uint32_t vmbus_rxbr_copyfrom(const struct vmbus_rxbr *rbr, uint32_t rindex, void *dst0, int cplen) { uint8_t *dst = dst0; const uint8_t *br_data = rbr->rxbr_data; uint32_t br_dsize = rbr->rxbr_dsize; if (cplen > br_dsize - rindex) { uint32_t fraglen = br_dsize - rindex; /* Wrap-around detected. */ memcpy(dst, br_data + rindex, fraglen); memcpy(dst + fraglen, br_data, cplen - fraglen); } else { memcpy(dst, br_data + rindex, cplen); } return VMBUS_BR_IDXINC(rindex, cplen, br_dsize); } +static __inline uint32_t +vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex, + int cplen, vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint8_t *br_data = rbr->rxbr_data; + uint32_t br_dsize = rbr->rxbr_dsize; + int error = 0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + error = cb((void *)(br_data + rindex), fraglen, cbarg); + if (!error) + error = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + error = cb((void *)(br_data + rindex), cplen, cbarg); + } + return (error); +} + int vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen) { mtx_lock_spin(&rbr->rxbr_lock); /* * The requested data and the 64bits channel packet * offset should be there at least. */ if (vmbus_rxbr_avail(rbr) < dlen + sizeof(uint64_t)) { mtx_unlock_spin(&rbr->rxbr_lock); return (EAGAIN); } vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen); mtx_unlock_spin(&rbr->rxbr_lock); + + return (0); +} + +/* + * NOTE: + * We only hold spin lock to check the ring buffer space. It is + * released before calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. + */ +int +vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint32_t rindex, br_dsize0 = rbr->rxbr_dsize; + int ret; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * The requested data + skip and the 64bits channel packet + * offset should be there at least. + */ + if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize0); + mtx_unlock_spin(&rbr->rxbr_lock); + + ret = vmbus_rxbr_copyfrom_call(rbr, rindex, dlen, cb, cbarg); + + return (ret); +} + +/* + * NOTE: + * We assume idx_adv == sizeof(channel packet). + */ +int +vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen, + uint32_t idx_adv, boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough data to read. + */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t) + dlen) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + if (idx_adv > 0) { + /* + * Advance the read index first, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + } + + vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen); + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + if (idx_adv > 0) + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + + sizeof(uint64_t)); + else + *need_sig = false; + } + + return (0); +} + +/* + * NOTE: + * Just update the RX rb index. + */ +int +vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv, + boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough space to advance. + */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + /* + * Advance the read index, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + sizeof(uint64_t)); + } return (0); } /* * NOTE: * We assume (dlen + skip) == sizeof(channel packet). */ int vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip) { uint32_t rindex, br_dsize = rbr->rxbr_dsize; KASSERT(dlen + skip > 0, ("invalid dlen %d, offset %u", dlen, skip)); mtx_lock_spin(&rbr->rxbr_lock); if (vmbus_rxbr_avail(rbr) < dlen + skip + sizeof(uint64_t)) { mtx_unlock_spin(&rbr->rxbr_lock); return (EAGAIN); } /* * Copy channel packet from RX bufring. */ rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize); rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen); /* * Discard this channel packet's 64bits offset, which is useless to us. */ rindex = VMBUS_BR_IDXINC(rindex, sizeof(uint64_t), br_dsize); /* * Update the read index _after_ the channel packet is fetched. */ __compiler_membar(); rbr->rxbr_rindex = rindex; mtx_unlock_spin(&rbr->rxbr_lock); return (0); } Index: head/sys/dev/hyperv/vmbus/vmbus_brvar.h =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_brvar.h (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus_brvar.h (revision 361275) @@ -1,130 +1,157 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_BRVAR_H_ #define _VMBUS_BRVAR_H_ #include #include #include #include struct vmbus_br { struct vmbus_bufring *vbr; uint32_t vbr_dsize; /* total data size */ }; #define vbr_windex vbr->br_windex #define vbr_rindex vbr->br_rindex #define vbr_imask vbr->br_imask +#define vbr_psndsz vbr->br_pending_snd_sz +#define vbr_fpsndsz vbr->br_feature_bits.feat_pending_snd_sz +#define vbr_fvalue vbr->br_feature_bits.value +#define vbr_intrcnt vbr->br_g2h_intr_cnt #define vbr_data vbr->br_data struct vmbus_rxbr { struct mtx rxbr_lock; struct vmbus_br rxbr; }; #define rxbr_windex rxbr.vbr_windex #define rxbr_rindex rxbr.vbr_rindex #define rxbr_imask rxbr.vbr_imask +#define rxbr_psndsz rxbr.vbr_psndsz +#define rxbr_fpsndsz rxbr.vbr_fpsndsz +#define rxbr_fvalue rxbr.vbr_fvalue +#define rxbr_intrcnt rxbr.vbr_intrcnt #define rxbr_data rxbr.vbr_data #define rxbr_dsize rxbr.vbr_dsize struct vmbus_txbr { struct mtx txbr_lock; struct vmbus_br txbr; }; #define txbr_windex txbr.vbr_windex #define txbr_rindex txbr.vbr_rindex #define txbr_imask txbr.vbr_imask +#define txbr_psndsz txbr.vbr_psndsz +#define txbr_fpsndsz txbr.vbr_fpsndsz +#define txbr_fvalue txbr.vbr_fvalue +#define txbr_intrcnt txbr.vbr_intrcnt #define txbr_data txbr.vbr_data #define txbr_dsize txbr.vbr_dsize struct sysctl_ctx_list; struct sysctl_oid; static __inline int vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr) { /* * - 64 bits for the trailing start index (- sizeof(uint64_t)). * - The rindex and windex can't be same (- 1). See * the comment near vmbus_bufring.br_{r,w}index. */ return (tbr->txbr_dsize - sizeof(uint64_t) - 1); } static __inline bool vmbus_txbr_empty(const struct vmbus_txbr *tbr) { return (tbr->txbr_windex == tbr->txbr_rindex ? true : false); } static __inline bool vmbus_rxbr_empty(const struct vmbus_rxbr *rbr) { return (rbr->rxbr_windex == rbr->rxbr_rindex ? true : false); } static __inline int vmbus_br_nelem(int br_size, int elem_size) { /* Strip bufring header */ br_size -= sizeof(struct vmbus_bufring); /* Add per-element trailing index */ elem_size += sizeof(uint64_t); return (br_size / elem_size); } void vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree, struct vmbus_br *br, const char *name); void vmbus_rxbr_init(struct vmbus_rxbr *rbr); void vmbus_rxbr_deinit(struct vmbus_rxbr *rbr); void vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen); int vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen); int vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip); +int vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv, + boolean_t *need_sig); +int vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, + int dlen, uint32_t idx_adv, boolean_t *need_sig); +int vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, + uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg); void vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr); uint32_t vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr); +uint32_t vmbus_rxbr_available(const struct vmbus_rxbr *rbr); void vmbus_txbr_init(struct vmbus_txbr *tbr); void vmbus_txbr_deinit(struct vmbus_txbr *tbr); void vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen); int vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen, boolean_t *need_sig); +int vmbus_txbr_write_call(struct vmbus_txbr *tbr, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg, + boolean_t *need_sig); +uint32_t vmbus_txbr_available(const struct vmbus_txbr *tbr); +uint32_t vmbus_txbr_get_imask(const struct vmbus_txbr *tbr); +void vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, + uint32_t size); #endif /* _VMBUS_BRVAR_H_ */ Index: head/sys/dev/hyperv/vmbus/vmbus_chan.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_chan.c (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus_chan.c (revision 361275) @@ -1,2206 +1,2390 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmbus_chan_pollarg { struct vmbus_channel *poll_chan; u_int poll_hz; }; static void vmbus_chan_update_evtflagcnt( struct vmbus_softc *, const struct vmbus_channel *); static int vmbus_chan_close_internal( struct vmbus_channel *); static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS); static void vmbus_chan_sysctl_create( struct vmbus_channel *); static struct vmbus_channel *vmbus_chan_alloc(struct vmbus_softc *); static void vmbus_chan_free(struct vmbus_channel *); static int vmbus_chan_add(struct vmbus_channel *); static void vmbus_chan_cpu_default(struct vmbus_channel *); static int vmbus_chan_release(struct vmbus_channel *); static void vmbus_chan_set_chmap(struct vmbus_channel *); static void vmbus_chan_clear_chmap(struct vmbus_channel *); static void vmbus_chan_detach(struct vmbus_channel *); static bool vmbus_chan_wait_revoke( const struct vmbus_channel *, bool); static void vmbus_chan_poll_timeout(void *); static bool vmbus_chan_poll_cancel_intq( struct vmbus_channel *); static void vmbus_chan_poll_cancel(struct vmbus_channel *); static void vmbus_chan_ins_prilist(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_rem_prilist(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_ins_list(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_rem_list(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_ins_sublist(struct vmbus_channel *, struct vmbus_channel *); static void vmbus_chan_rem_sublist(struct vmbus_channel *, struct vmbus_channel *); static void vmbus_chan_task(void *, int); static void vmbus_chan_task_nobatch(void *, int); static void vmbus_chan_poll_task(void *, int); static void vmbus_chan_clrchmap_task(void *, int); static void vmbus_chan_pollcfg_task(void *, int); static void vmbus_chan_polldis_task(void *, int); static void vmbus_chan_poll_cancel_task(void *, int); static void vmbus_prichan_attach_task(void *, int); static void vmbus_subchan_attach_task(void *, int); static void vmbus_prichan_detach_task(void *, int); static void vmbus_subchan_detach_task(void *, int); static void vmbus_chan_msgproc_choffer(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chan_msgproc_chrescind( struct vmbus_softc *, const struct vmbus_message *); static int vmbus_chan_printf(const struct vmbus_channel *, const char *, ...) __printflike(2, 3); /* * Vmbus channel message processing. */ static const vmbus_chanmsg_proc_t vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER, vmbus_chan_msgproc_choffer), VMBUS_CHANMSG_PROC(CHRESCIND, vmbus_chan_msgproc_chrescind), VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP) }; /* - * Notify host that there are data pending on our TX bufring. + * Notify host that there are data pending on our TX bufring or + * we have put some data on the TX bufring. */ static __inline void -vmbus_chan_signal_tx(const struct vmbus_channel *chan) +vmbus_chan_signal(const struct vmbus_channel *chan) { atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask); if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask); else hypercall_signal_event(chan->ch_monprm_dma.hv_paddr); } +static __inline void +vmbus_chan_signal_tx(struct vmbus_channel *chan) +{ + chan->ch_txbr.txbr_intrcnt ++; + + vmbus_chan_signal(chan); +} + +static __inline void +vmbus_chan_signal_rx(struct vmbus_channel *chan) +{ + chan->ch_rxbr.rxbr_intrcnt ++; + + vmbus_chan_signal(chan); +} + static void vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONPRIL_SHIFT)) panic("channel is already on the prilist"); TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink); } static void vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0) panic("channel is not on the prilist"); TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink); } static void vmbus_chan_ins_sublist(struct vmbus_channel *prichan, struct vmbus_channel *chan) { mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONSUBL_SHIFT)) panic("channel is already on the sublist"); TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink); /* Bump sub-channel count. */ prichan->ch_subchan_cnt++; } static void vmbus_chan_rem_sublist(struct vmbus_channel *prichan, struct vmbus_channel *chan) { mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); KASSERT(prichan->ch_subchan_cnt > 0, ("invalid subchan_cnt %d", prichan->ch_subchan_cnt)); prichan->ch_subchan_cnt--; if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0) panic("channel is not on the sublist"); TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink); } static void vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONLIST_SHIFT)) panic("channel is already on the list"); TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link); } static void vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONLIST_SHIFT) == 0) panic("channel is not on the list"); TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link); } static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS) { struct vmbus_channel *chan = arg1; int mnf = 0; if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) mnf = 1; return sysctl_handle_int(oidp, &mnf, 0, req); } static void vmbus_chan_sysctl_create(struct vmbus_channel *chan) { struct sysctl_oid *ch_tree, *chid_tree, *br_tree; struct sysctl_ctx_list *ctx; uint32_t ch_id; char name[16]; /* * Add sysctl nodes related to this channel to this * channel's sysctl ctx, so that they can be destroyed * independently upon close of this channel, which can * happen even if the device is not detached. */ ctx = &chan->ch_sysctl_ctx; sysctl_ctx_init(ctx); /* * Create dev.NAME.UNIT.channel tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)), OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID tree. */ if (VMBUS_CHAN_ISPRIMARY(chan)) ch_id = chan->ch_id; else ch_id = chan->ch_prichan->ch_id; snprintf(name, sizeof(name), "%d", ch_id); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Create dev.NAME.UNIT.channel.CHANID.sub tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree. * * NOTE: * chid_tree is changed to this new sysctl tree. */ snprintf(name, sizeof(name), "%d", chan->ch_subidx); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id"); } SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, chan, 0, vmbus_chan_sysctl_mnf, "I", "has monitor notification facilities"); br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (br_tree != NULL) { /* * Create sysctl tree for RX bufring. */ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx"); /* * Create sysctl tree for TX bufring. */ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx"); } } int vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { struct vmbus_chan_br cbr; int error; /* * Allocate the TX+RX bufrings. */ KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated")); chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev), PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma, BUS_DMA_WAITOK); if (chan->ch_bufring == NULL) { vmbus_chan_printf(chan, "bufring allocation failed\n"); return (ENOMEM); } cbr.cbr = chan->ch_bufring; cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr; cbr.cbr_txsz = txbr_size; cbr.cbr_rxsz = rxbr_size; error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg); if (error) { if (error == EISCONN) { /* * XXX * The bufring GPADL is still connected; abandon * this bufring, instead of having mysterious * crash or trashed data later on. */ vmbus_chan_printf(chan, "chan%u bufring GPADL " "is still connected upon channel open error; " "leak %d bytes memory\n", chan->ch_id, txbr_size + rxbr_size); } else { hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); } chan->ch_bufring = NULL; } return (error); } int vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { struct vmbus_softc *sc = chan->ch_vmbus; const struct vmbus_message *msg; struct vmbus_chanmsg_chopen *req; struct vmbus_msghc *mh; uint32_t status; int error, txbr_size, rxbr_size; task_fn_t *task_fn; uint8_t *br; if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) { vmbus_chan_printf(chan, "invalid udata len %d for chan%u\n", udlen, chan->ch_id); return (EINVAL); } br = cbr->cbr; txbr_size = cbr->cbr_txsz; rxbr_size = cbr->cbr_rxsz; KASSERT((txbr_size & PAGE_MASK) == 0, ("send bufring size is not multiple page")); KASSERT((rxbr_size & PAGE_MASK) == 0, ("recv bufring size is not multiple page")); KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0, ("bufring is not page aligned")); /* * Zero out the TX/RX bufrings, in case that they were used before. */ memset(br, 0, txbr_size + rxbr_size); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED_SHIFT)) panic("double-open chan%u", chan->ch_id); chan->ch_cb = cb; chan->ch_cbarg = cbarg; vmbus_chan_update_evtflagcnt(sc, chan); chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) task_fn = vmbus_chan_task; else task_fn = vmbus_chan_task_nobatch; TASK_INIT(&chan->ch_task, 0, task_fn, chan); /* TX bufring comes first */ vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size); /* RX bufring immediately follows TX bufring */ vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size); /* Create sysctl tree for this channel */ vmbus_chan_sysctl_create(chan); /* * Connect the bufrings, both RX and TX, to this channel. */ error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr, txbr_size + rxbr_size, &chan->ch_bufring_gpadl); if (error) { vmbus_chan_printf(chan, "failed to connect bufring GPADL to chan%u\n", chan->ch_id); goto failed; } /* * Install this channel, before it is opened, but after everything * else has been setup. */ vmbus_chan_set_chmap(chan); /* * Open channel w/ the bufring GPADL on the target CPU. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chopen(chan%u)\n", chan->ch_id); error = ENXIO; goto failed; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN; req->chm_chanid = chan->ch_id; req->chm_openid = chan->ch_id; req->chm_gpadl = chan->ch_bufring_gpadl; req->chm_vcpuid = chan->ch_vcpuid; req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT; if (udlen > 0) memcpy(req->chm_udata, udata, udlen); error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_chan_printf(chan, "chopen(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); goto failed; } for (;;) { msg = vmbus_msghc_poll_result(sc, mh); if (msg != NULL) break; if (vmbus_chan_is_revoked(chan)) { int i; /* * NOTE: * Hypervisor does _not_ send response CHOPEN to * a revoked channel. */ vmbus_chan_printf(chan, "chan%u is revoked, when it is being opened\n", chan->ch_id); /* * XXX * Add extra delay before cancel the hypercall * execution; mainly to close any possible * CHRESCIND and CHOPEN_RESP races on the * hypervisor side. */ #define REVOKE_LINGER 100 for (i = 0; i < REVOKE_LINGER; ++i) { msg = vmbus_msghc_poll_result(sc, mh); if (msg != NULL) break; pause("rchopen", 1); } #undef REVOKE_LINGER if (msg == NULL) vmbus_msghc_exec_cancel(sc, mh); break; } pause("chopen", 1); } if (msg != NULL) { status = ((const struct vmbus_chanmsg_chopen_resp *) msg->msg_data)->chm_status; } else { /* XXX any non-0 value is ok here. */ status = 0xff; } vmbus_msghc_put(sc, mh); if (status == 0) { if (bootverbose) vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id); return (0); } vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id); error = ENXIO; failed: sysctl_ctx_free(&chan->ch_sysctl_ctx); vmbus_chan_clear_chmap(chan); if (chan->ch_bufring_gpadl != 0) { int error1; error1 = vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); if (error1) { /* * Give caller a hint that the bufring GPADL is still * connected. */ error = EISCONN; } chan->ch_bufring_gpadl = 0; } atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED); return (error); } int vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl0) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_conn *req; const struct vmbus_message *msg; size_t reqsz; uint32_t gpadl, status; int page_count, range_len, i, cnt, error; uint64_t page_id; KASSERT(*gpadl0 == 0, ("GPADL is not zero")); /* * Preliminary checks. */ KASSERT((size & PAGE_MASK) == 0, ("invalid GPA size %d, not multiple page size", size)); page_count = size >> PAGE_SHIFT; KASSERT((paddr & PAGE_MASK) == 0, ("GPA is not page aligned %jx", (uintmax_t)paddr)); page_id = paddr >> PAGE_SHIFT; range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]); /* * We don't support multiple GPA ranges. */ if (range_len > UINT16_MAX) { vmbus_chan_printf(chan, "GPA too large, %d pages\n", page_count); return EOPNOTSUPP; } /* * Allocate GPADL id. */ gpadl = vmbus_gpadl_alloc(sc); /* * Connect this GPADL to the target channel. * * NOTE: * Since each message can only hold small set of page * addresses, several messages may be required to * complete the connection. */ if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn, chm_range.gpa_page[cnt]); mh = vmbus_msghc_get(sc, reqsz); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for gpadl_conn(chan%u)\n", chan->ch_id); return EIO; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; req->chm_range_len = range_len; req->chm_range_cnt = 1; req->chm_range.gpa_len = size; req->chm_range.gpa_ofs = 0; for (i = 0; i < cnt; ++i) req->chm_range.gpa_page[i] = page_id++; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); return error; } while (page_count > 0) { struct vmbus_chanmsg_gpadl_subconn *subreq; if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn, chm_gpa_page[cnt]); vmbus_msghc_reset(mh, reqsz); subreq = vmbus_msghc_dataptr(mh); subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN; subreq->chm_gpadl = gpadl; for (i = 0; i < cnt; ++i) subreq->chm_gpa_page[i] = page_id++; vmbus_msghc_exec_noresult(mh); } KASSERT(page_count == 0, ("invalid page count %d", page_count)); msg = vmbus_msghc_wait_result(sc, mh); status = ((const struct vmbus_chanmsg_gpadl_connresp *) msg->msg_data)->chm_status; vmbus_msghc_put(sc, mh); if (status != 0) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n", chan->ch_id, status); return EIO; } /* Done; commit the GPADL id. */ *gpadl0 = gpadl; if (bootverbose) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n", chan->ch_id); } return 0; } static bool vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep) { #define WAIT_COUNT 200 /* 200ms */ int i; for (i = 0; i < WAIT_COUNT; ++i) { if (vmbus_chan_is_revoked(chan)) return (true); if (can_sleep) pause("wchrev", 1); else DELAY(1000); } return (false); #undef WAIT_COUNT } /* * Disconnect the GPA from the target channel */ int vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_disconn *req; int error; KASSERT(gpadl != 0, ("GPADL is zero")); mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for gpadl_disconn(chan%u)\n", chan->ch_id); return (EBUSY); } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); if (vmbus_chan_wait_revoke(chan, true)) { /* * Error is benign; this channel is revoked, * so this GPADL will not be touched anymore. */ vmbus_chan_printf(chan, "gpadl_disconn(revoked chan%u) msg hypercall " "exec failed: %d\n", chan->ch_id, error); return (0); } vmbus_chan_printf(chan, "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); return (error); } vmbus_msghc_wait_result(sc, mh); /* Discard result; no useful information */ vmbus_msghc_put(sc, mh); return (0); } static void vmbus_chan_detach(struct vmbus_channel *chan) { int refs; KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d", chan->ch_id, chan->ch_refs)); refs = atomic_fetchadd_int(&chan->ch_refs, -1); #ifdef INVARIANTS if (VMBUS_CHAN_ISPRIMARY(chan)) { KASSERT(refs == 1, ("chan%u: invalid refcnt %d for prichan", chan->ch_id, refs + 1)); } #endif if (refs == 1) { /* * Detach the target channel. */ if (bootverbose) { vmbus_chan_printf(chan, "chan%u detached\n", chan->ch_id); } taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } } static void vmbus_chan_clrchmap_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; } static void vmbus_chan_clear_chmap(struct vmbus_channel *chan) { struct task chmap_task; TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan); vmbus_chan_run_task(chan, &chmap_task); } static void vmbus_chan_set_chmap(struct vmbus_channel *chan) { __compiler_membar(); chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; } static void vmbus_chan_poll_cancel_task(void *xchan, int pending __unused) { vmbus_chan_poll_cancel_intq(xchan); } static void vmbus_chan_poll_cancel(struct vmbus_channel *chan) { struct task poll_cancel; TASK_INIT(&poll_cancel, 0, vmbus_chan_poll_cancel_task, chan); vmbus_chan_run_task(chan, &poll_cancel); } static int vmbus_chan_close_internal(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_chclose *req; uint32_t old_stflags; int error; /* * NOTE: * Sub-channels are closed upon their primary channel closing, * so they can be closed even before they are opened. */ for (;;) { old_stflags = chan->ch_stflags; if (atomic_cmpset_int(&chan->ch_stflags, old_stflags, old_stflags & ~VMBUS_CHAN_ST_OPENED)) break; } if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) { /* Not opened yet; done */ if (bootverbose) { vmbus_chan_printf(chan, "chan%u not opened\n", chan->ch_id); } return (0); } /* * Free this channel's sysctl tree attached to its device's * sysctl tree. */ sysctl_ctx_free(&chan->ch_sysctl_ctx); /* * Cancel polling, if it is enabled. */ vmbus_chan_poll_cancel(chan); /* * NOTE: * Order is critical. This channel _must_ be uninstalled first, * else the channel task may be enqueued by the IDT after it has * been drained. */ vmbus_chan_clear_chmap(chan); taskqueue_drain(chan->ch_tq, &chan->ch_task); chan->ch_tq = NULL; /* * Close this channel. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chclose(chan%u)\n", chan->ch_id); error = ENXIO; goto disconnect; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { vmbus_chan_printf(chan, "chclose(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); goto disconnect; } if (bootverbose) vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id); disconnect: /* * Disconnect the TX+RX bufrings from this channel. */ if (chan->ch_bufring_gpadl != 0) { int error1; error1 = vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); if (error1) { /* * XXX * The bufring GPADL is still connected; abandon * this bufring, instead of having mysterious * crash or trashed data later on. */ vmbus_chan_printf(chan, "chan%u bufring GPADL " "is still connected after close\n", chan->ch_id); chan->ch_bufring = NULL; /* * Give caller a hint that the bufring GPADL is * still connected. */ error = EISCONN; } chan->ch_bufring_gpadl = 0; } /* * Destroy the TX+RX bufrings. */ if (chan->ch_bufring != NULL) { hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); chan->ch_bufring = NULL; } return (error); } int vmbus_chan_close_direct(struct vmbus_channel *chan) { int error; #ifdef INVARIANTS if (VMBUS_CHAN_ISPRIMARY(chan)) { struct vmbus_channel *subchan; /* * All sub-channels _must_ have been closed, or are _not_ * opened at all. */ mtx_lock(&chan->ch_subchan_lock); TAILQ_FOREACH(subchan, &chan->ch_subchans, ch_sublink) { KASSERT( (subchan->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0, ("chan%u: subchan%u is still opened", chan->ch_id, subchan->ch_subidx)); } mtx_unlock(&chan->ch_subchan_lock); } #endif error = vmbus_chan_close_internal(chan); if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * This sub-channel is referenced, when it is linked to * the primary channel; drop that reference now. */ vmbus_chan_detach(chan); } return (error); } /* * Caller should make sure that all sub-channels have * been added to 'chan' and all to-be-closed channels * are not being opened. */ void vmbus_chan_close(struct vmbus_channel *chan) { int subchan_cnt; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Sub-channel is closed when its primary channel * is closed; done. */ return; } /* * Close all sub-channels, if any. */ subchan_cnt = chan->ch_subchan_cnt; if (subchan_cnt > 0) { struct vmbus_channel **subchan; int i; subchan = vmbus_subchan_get(chan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { vmbus_chan_close_internal(subchan[i]); /* * This sub-channel is referenced, when it is * linked to the primary channel; drop that * reference now. */ vmbus_chan_detach(subchan[i]); } vmbus_subchan_rel(subchan, subchan_cnt); } /* Then close the primary channel. */ vmbus_chan_close_internal(chan); } void vmbus_chan_intr_drain(struct vmbus_channel *chan) { taskqueue_drain(chan->ch_tq, &chan->ch_task); } +uint32_t +vmbus_chan_write_available(struct vmbus_channel *chan) +{ + return (vmbus_txbr_available(&chan->ch_txbr)); +} + +bool +vmbus_chan_write_signal(struct vmbus_channel *chan, + int32_t min_signal_size) +{ + if (min_signal_size >= 0 && + vmbus_chan_write_available(chan) > min_signal_size) { + return false; + } + + if (!vmbus_txbr_get_imask(&chan->ch_txbr)) { + /* txbr imask is not set, signal the reader */ + vmbus_chan_signal_tx(chan); + return true; + } + + return false; +} + +void +vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, + uint32_t size) +{ + if (chan) + vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size); +} + int +vmbus_chan_iov_send(struct vmbus_channel *chan, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg) +{ + int error; + boolean_t send_evt; + + if (iovlen == 0) + return (0); + + error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen, + cb, cbarg, &send_evt); + + if (!error && send_evt) { + vmbus_chan_signal_tx(chan); + } + + return error; +} + +int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt pkt; int pktlen, pad_pktlen, hlen, error; uint64_t pad = 0; struct iovec iov[3]; boolean_t send_evt; hlen = sizeof(pkt); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = type; pkt.cp_hdr.cph_flags = flags; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; iov[0].iov_base = &pkt; iov[0].iov_len = hlen; iov[1].iov_base = data; iov[1].iov_len = dlen; iov[2].iov_base = &pad; iov[2].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_sglist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_gpa_cnt = sglen; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = sg; iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen; iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_prplist(struct vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_prplist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; hlen = __offsetof(struct vmbus_chanpkt_prplist, cp_range[0].gpa_page[prp_cnt]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_range_cnt = 1; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = prp; iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]); iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0, uint64_t *xactid) { struct vmbus_chanpkt_hdr pkt; int error, dlen, hlen; error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) return (error); if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { vmbus_chan_printf(chan, "invalid hlen %u\n", pkt.cph_hlen); /* XXX this channel is dead actually. */ return (EIO); } if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) { vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n", pkt.cph_hlen, pkt.cph_tlen); /* XXX this channel is dead actually. */ return (EIO); } hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen); dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen; if (*dlen0 < dlen) { /* Return the size of this packet's data. */ *dlen0 = dlen; return (ENOBUFS); } *xactid = pkt.cph_xactid; *dlen0 = dlen; /* Skip packet header */ error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen); KASSERT(!error, ("vmbus_rxbr_read failed")); return (0); } int vmbus_chan_recv_pkt(struct vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt, int *pktlen0) { int error, pktlen, pkt_hlen; pkt_hlen = sizeof(*pkt); error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, pkt_hlen); if (error) return (error); if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen); /* XXX this channel is dead actually. */ return (EIO); } if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) { vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n", pkt->cph_hlen, pkt->cph_tlen); /* XXX this channel is dead actually. */ return (EIO); } pktlen = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen); if (*pktlen0 < pktlen) { /* Return the size of this packet. */ *pktlen0 = pktlen; return (ENOBUFS); } *pktlen0 = pktlen; /* * Skip the fixed-size packet header, which has been filled * by the above vmbus_rxbr_peek(). */ error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1, pktlen - pkt_hlen, pkt_hlen); KASSERT(!error, ("vmbus_rxbr_read failed")); return (0); } +uint32_t +vmbus_chan_read_available(struct vmbus_channel *chan) +{ + return (vmbus_rxbr_available(&chan->ch_rxbr)); +} + +/* + * This routine does: + * - Advance the channel read index for 'advance' bytes + * - Copy data_len bytes in to the buffer pointed by 'data' + * Return 0 if operation succeed. EAGAIN if operations if failed. + * If failed, the buffer pointed by 'data' is intact, and the + * channel read index is not advanced at all. + */ +int +vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (data == NULL || data_len <= 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr, + data, data_len, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + +/* + * This routine does: + * - Advance the channel read index for 'advance' bytes + */ +int +vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (advance == 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + + +/* + * Caller should hold its own lock to serialize the ring buffer + * copy. + */ +int +vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len, + uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg) +{ + if (!chan || data_len <= 0 || cb == NULL) + return (EINVAL); + + return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip, + cb, cbarg)); +} + static void vmbus_chan_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; vmbus_chan_callback_t cb = chan->ch_cb; void *cbarg = chan->ch_cbarg; KASSERT(chan->ch_poll_intvl == 0, ("chan%u: interrupted in polling mode", chan->ch_id)); /* * Optimize host to guest signaling by ensuring: * 1. While reading the channel, we disable interrupts from * host. * 2. Ensure that we process all posted messages from the host * before returning from this callback. * 3. Once we return, enable signaling from the host. Once this * state is set we check to see if additional packets are * available to read. In this case we repeat the process. * * NOTE: Interrupt has been disabled in the ISR. */ for (;;) { uint32_t left; cb(chan, cbarg); left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr); if (left == 0) { /* No more data in RX bufring; done */ break; } vmbus_rxbr_intr_mask(&chan->ch_rxbr); } } static void vmbus_chan_task_nobatch(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl == 0, ("chan%u: interrupted in polling mode", chan->ch_id)); chan->ch_cb(chan, chan->ch_cbarg); } static void vmbus_chan_poll_timeout(void *xchan) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl != 0, ("chan%u: polling timeout in interrupt mode", chan->ch_id)); taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); } static void vmbus_chan_poll_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl != 0, ("chan%u: polling in interrupt mode", chan->ch_id)); callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0, vmbus_chan_poll_timeout, chan, chan->ch_poll_flags); chan->ch_cb(chan, chan->ch_cbarg); } static void vmbus_chan_pollcfg_task(void *xarg, int pending __unused) { const struct vmbus_chan_pollarg *arg = xarg; struct vmbus_channel *chan = arg->poll_chan; sbintime_t intvl; int poll_flags; /* * Save polling interval. */ intvl = SBT_1S / arg->poll_hz; if (intvl == 0) intvl = 1; if (intvl == chan->ch_poll_intvl) { /* Nothing changes; done */ return; } chan->ch_poll_intvl = intvl; /* Adjust callout flags. */ poll_flags = C_DIRECT_EXEC; if (arg->poll_hz <= hz) poll_flags |= C_HARDCLOCK; chan->ch_poll_flags = poll_flags; /* * Disconnect this channel from the channel map to make sure that * the RX bufring interrupt enabling bit can not be touched, and * ISR can not enqueue this channel task anymore. THEN, disable * interrupt from the RX bufring (TX bufring does not generate * interrupt to VM). * * NOTE: order is critical. */ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; __compiler_membar(); vmbus_rxbr_intr_mask(&chan->ch_rxbr); /* * NOTE: * At this point, this channel task will not be enqueued by * the ISR anymore, time to cancel the pending one. */ taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL); /* Kick start! */ taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); } static bool vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan) { if (chan->ch_poll_intvl == 0) { /* Not enabled. */ return (false); } /* * Stop polling callout, so that channel polling task * will not be enqueued anymore. */ callout_drain(&chan->ch_poll_timeo); /* * Disable polling by resetting polling interval. * * NOTE: * The polling interval resetting MUST be conducted * after the callout is drained; mainly to keep the * proper assertion in place. */ chan->ch_poll_intvl = 0; /* * NOTE: * At this point, this channel polling task will not be * enqueued by the callout anymore, time to cancel the * pending one. */ taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL); /* Polling was enabled. */ return (true); } static void vmbus_chan_polldis_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; if (!vmbus_chan_poll_cancel_intq(chan)) { /* Already disabled; done. */ return; } /* * Plug this channel back to the channel map and unmask * the RX bufring interrupt. */ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; __compiler_membar(); vmbus_rxbr_intr_unmask(&chan->ch_rxbr); /* * Kick start the interrupt task, just in case unmasking * interrupt races ISR. */ taskqueue_enqueue(chan->ch_tq, &chan->ch_task); } static __inline void vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, int flag_cnt) { int f; for (f = 0; f < flag_cnt; ++f) { uint32_t chid_base; u_long flags; int chid_ofs; if (event_flags[f] == 0) continue; flags = atomic_swap_long(&event_flags[f], 0); chid_base = f << VMBUS_EVTFLAG_SHIFT; while ((chid_ofs = ffsl(flags)) != 0) { struct vmbus_channel *chan; --chid_ofs; /* NOTE: ffsl is 1-based */ flags &= ~(1UL << chid_ofs); chan = sc->vmbus_chmap[chid_base + chid_ofs]; if (__predict_false(chan == NULL)) { /* Channel is closed. */ continue; } __compiler_membar(); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) vmbus_rxbr_intr_mask(&chan->ch_rxbr); taskqueue_enqueue(chan->ch_tq, &chan->ch_task); } } } void vmbus_event_proc(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; /* * On Host with Win8 or above, the event page can be checked directly * to get the id of the channel that has the pending interrupt. */ eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; vmbus_event_flags_proc(sc, eventf->evt_flags, VMBUS_PCPU_GET(sc, event_flags_cnt, cpu)); } void vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) { vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags, VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT); } } static void vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc, const struct vmbus_channel *chan) { volatile int *flag_cnt_ptr; int flag_cnt; flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1; flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid); for (;;) { int old_flag_cnt; old_flag_cnt = *flag_cnt_ptr; if (old_flag_cnt >= flag_cnt) break; if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) { if (bootverbose) { vmbus_chan_printf(chan, "chan%u update cpu%d flag_cnt to %d\n", chan->ch_id, chan->ch_cpuid, flag_cnt); } break; } } } static struct vmbus_channel * vmbus_chan_alloc(struct vmbus_softc *sc) { struct vmbus_channel *chan; chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO); chan->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param), &chan->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (chan->ch_monprm == NULL) { device_printf(sc->vmbus_dev, "monprm alloc failed\n"); free(chan, M_DEVBUF); return NULL; } chan->ch_refs = 1; chan->ch_vmbus = sc; mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF); sx_init(&chan->ch_orphan_lock, "vmbus chorphan"); TAILQ_INIT(&chan->ch_subchans); vmbus_rxbr_init(&chan->ch_rxbr); vmbus_txbr_init(&chan->ch_txbr); TASK_INIT(&chan->ch_poll_task, 0, vmbus_chan_poll_task, chan); callout_init(&chan->ch_poll_timeo, 1); return chan; } static void vmbus_chan_free(struct vmbus_channel *chan) { KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0, ("still owns sub-channels")); KASSERT((chan->ch_stflags & (VMBUS_CHAN_ST_OPENED | VMBUS_CHAN_ST_ONPRIL | VMBUS_CHAN_ST_ONSUBL | VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel")); KASSERT(chan->ch_orphan_xact == NULL, ("still has orphan xact installed")); KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d", chan->ch_id, chan->ch_refs)); KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated", chan->ch_id)); hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm); mtx_destroy(&chan->ch_subchan_lock); sx_destroy(&chan->ch_orphan_lock); vmbus_rxbr_deinit(&chan->ch_rxbr); vmbus_txbr_deinit(&chan->ch_txbr); free(chan, M_DEVBUF); } static int vmbus_chan_add(struct vmbus_channel *newchan) { struct vmbus_softc *sc = newchan->ch_vmbus; struct vmbus_channel *prichan; if (newchan->ch_id == 0) { /* * XXX * Chan0 will neither be processed nor should be offered; * skip it. */ device_printf(sc->vmbus_dev, "got chan0 offer, discard\n"); return EINVAL; } else if (newchan->ch_id >= VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid chan%u offer\n", newchan->ch_id); return EINVAL; } mtx_lock(&sc->vmbus_prichan_lock); TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) { /* * Sub-channel will have the same type GUID and instance * GUID as its primary channel. */ if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type, sizeof(struct hyperv_guid)) == 0 && memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst, sizeof(struct hyperv_guid)) == 0) break; } if (VMBUS_CHAN_ISPRIMARY(newchan)) { if (prichan == NULL) { /* Install the new primary channel */ vmbus_chan_ins_prilist(sc, newchan); mtx_unlock(&sc->vmbus_prichan_lock); goto done; } else { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "duplicated primary chan%u\n", newchan->ch_id); return EINVAL; } } else { /* Sub-channel */ if (prichan == NULL) { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "no primary chan for chan%u\n", newchan->ch_id); return EINVAL; } /* * Found the primary channel for this sub-channel and * move on. * * XXX refcnt prichan */ } mtx_unlock(&sc->vmbus_prichan_lock); /* * This is a sub-channel; link it with the primary channel. */ KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan), ("new channel is not sub-channel")); KASSERT(prichan != NULL, ("no primary channel")); /* * Reference count this sub-channel; it will be dereferenced * when this sub-channel is closed. */ KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d", newchan->ch_id, newchan->ch_refs)); atomic_add_int(&newchan->ch_refs, 1); newchan->ch_prichan = prichan; newchan->ch_dev = prichan->ch_dev; mtx_lock(&prichan->ch_subchan_lock); vmbus_chan_ins_sublist(prichan, newchan); mtx_unlock(&prichan->ch_subchan_lock); /* * Notify anyone that is interested in this sub-channel, * after this sub-channel is setup. */ wakeup(prichan); done: /* * Hook this channel up for later revocation. */ mtx_lock(&sc->vmbus_chan_lock); vmbus_chan_ins_list(sc, newchan); mtx_unlock(&sc->vmbus_chan_lock); if (bootverbose) { vmbus_chan_printf(newchan, "chan%u subidx%u offer\n", newchan->ch_id, newchan->ch_subidx); } /* Select default cpu for this channel. */ vmbus_chan_cpu_default(newchan); return 0; } void vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu) { KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu)); if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 || chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) { /* Only cpu0 is supported */ cpu = 0; } chan->ch_cpuid = cpu; chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu); if (bootverbose) { vmbus_chan_printf(chan, "chan%u assigned to cpu%u [vcpu%u]\n", chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid); } } void vmbus_chan_cpu_rr(struct vmbus_channel *chan) { static uint32_t vmbus_chan_nextcpu; int cpu; cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus; vmbus_chan_cpu_set(chan, cpu); } static void vmbus_chan_cpu_default(struct vmbus_channel *chan) { /* * By default, pin the channel to cpu0. Devices having * special channel-cpu mapping requirement should call * vmbus_chan_cpu_{set,rr}(). */ vmbus_chan_cpu_set(chan, 0); } static void vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_choffer *offer; struct vmbus_channel *chan; task_fn_t *detach_fn, *attach_fn; int error; offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data; chan = vmbus_chan_alloc(sc); if (chan == NULL) { device_printf(sc->vmbus_dev, "allocate chan%u failed\n", offer->chm_chanid); return; } chan->ch_id = offer->chm_chanid; chan->ch_subidx = offer->chm_subidx; chan->ch_guid_type = offer->chm_chtype; chan->ch_guid_inst = offer->chm_chinst; /* Batch reading is on by default */ chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT; if (sc->vmbus_version != VMBUS_VERSION_WS2008) chan->ch_monprm->mp_connid = offer->chm_connid; if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) { int trig_idx; /* * Setup MNF stuffs. */ chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF; trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN; if (trig_idx >= VMBUS_MONTRIGS_MAX) panic("invalid monitor trigger %u", offer->chm_montrig); chan->ch_montrig = &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending; chan->ch_montrig_mask = 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN); } + if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) { + /* This is HyperV socket channel */ + chan->ch_is_hvs = true; + /* The first byte != 0 means the host initiated connection. */ + chan->ch_hvs_conn_from_host = + offer->chm_udata.pipe.user_def[0]; + + if (bootverbose) { + device_printf(sc->vmbus_dev, + "chan%u is hyperv socket channel " + "connected %s host\n", + chan->ch_id, + (chan->ch_hvs_conn_from_host != 0) ? + "from" : "to"); + } + } else { + chan->ch_is_hvs = false; + } + /* * Setup event flag. */ chan->ch_evtflag = &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT]; chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK); /* * Setup attach and detach tasks. */ if (VMBUS_CHAN_ISPRIMARY(chan)) { chan->ch_mgmt_tq = sc->vmbus_devtq; attach_fn = vmbus_prichan_attach_task; detach_fn = vmbus_prichan_detach_task; } else { chan->ch_mgmt_tq = sc->vmbus_subchtq; attach_fn = vmbus_subchan_attach_task; detach_fn = vmbus_subchan_detach_task; } TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan); TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan); error = vmbus_chan_add(chan); if (error) { device_printf(sc->vmbus_dev, "add chan%u failed: %d\n", chan->ch_id, error); atomic_subtract_int(&chan->ch_refs, 1); vmbus_chan_free(chan); return; } taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task); } static void vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_chrescind *note; struct vmbus_channel *chan; note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data; if (note->chm_chanid > VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid revoked chan%u\n", note->chm_chanid); return; } /* * Find and remove the target channel from the channel list. */ mtx_lock(&sc->vmbus_chan_lock); TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { if (chan->ch_id == note->chm_chanid) break; } if (chan == NULL) { mtx_unlock(&sc->vmbus_chan_lock); device_printf(sc->vmbus_dev, "chan%u is not offered\n", note->chm_chanid); return; } vmbus_chan_rem_list(sc, chan); mtx_unlock(&sc->vmbus_chan_lock); if (VMBUS_CHAN_ISPRIMARY(chan)) { /* * The target channel is a primary channel; remove the * target channel from the primary channel list now, * instead of later, so that it will not be found by * other sub-channel offers, which are processed in * this thread. */ mtx_lock(&sc->vmbus_prichan_lock); vmbus_chan_rem_prilist(sc, chan); mtx_unlock(&sc->vmbus_prichan_lock); } /* * NOTE: * The following processing order is critical: * Set the REVOKED state flag before orphaning the installed xact. */ if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_REVOKED_SHIFT)) panic("channel has already been revoked"); sx_xlock(&chan->ch_orphan_lock); if (chan->ch_orphan_xact != NULL) vmbus_xact_ctx_orphan(chan->ch_orphan_xact); sx_xunlock(&chan->ch_orphan_lock); if (bootverbose) vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid); vmbus_chan_detach(chan); } static int vmbus_chan_release(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_chanmsg_chfree *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chfree(chan%u)\n", chan->ch_id); return (ENXIO); } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { vmbus_chan_printf(chan, "chfree(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); } else { if (bootverbose) vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id); } return (error); } static void vmbus_prichan_detach_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(VMBUS_CHAN_ISPRIMARY(chan), ("chan%u is not primary channel", chan->ch_id)); /* Delete and detach the device associated with this channel. */ vmbus_delete_child(chan); /* Release this channel (back to vmbus). */ vmbus_chan_release(chan); /* Free this channel's resource. */ vmbus_chan_free(chan); } static void vmbus_subchan_detach_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; struct vmbus_channel *pri_chan = chan->ch_prichan; KASSERT(!VMBUS_CHAN_ISPRIMARY(chan), ("chan%u is primary channel", chan->ch_id)); /* Release this channel (back to vmbus). */ vmbus_chan_release(chan); /* Unlink from its primary channel's sub-channel list. */ mtx_lock(&pri_chan->ch_subchan_lock); vmbus_chan_rem_sublist(pri_chan, chan); mtx_unlock(&pri_chan->ch_subchan_lock); /* Notify anyone that is waiting for this sub-channel to vanish. */ wakeup(pri_chan); /* Free this channel's resource. */ vmbus_chan_free(chan); } static void vmbus_prichan_attach_task(void *xchan, int pending __unused) { /* * Add device for this primary channel. */ vmbus_add_child(xchan); } static void vmbus_subchan_attach_task(void *xchan __unused, int pending __unused) { /* Nothing */ } void vmbus_chan_destroy_all(struct vmbus_softc *sc) { /* * Detach all devices and destroy the corresponding primary * channels. */ for (;;) { struct vmbus_channel *chan; mtx_lock(&sc->vmbus_chan_lock); TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { if (VMBUS_CHAN_ISPRIMARY(chan)) break; } if (chan == NULL) { /* No more primary channels; done. */ mtx_unlock(&sc->vmbus_chan_lock); break; } vmbus_chan_rem_list(sc, chan); mtx_unlock(&sc->vmbus_chan_lock); mtx_lock(&sc->vmbus_prichan_lock); vmbus_chan_rem_prilist(sc, chan); mtx_unlock(&sc->vmbus_prichan_lock); taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } } struct vmbus_channel ** vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt) { struct vmbus_channel **ret, *chan; int i; KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt)); ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP, M_WAITOK); mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt < subchan_cnt) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0); i = 0; TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) { /* TODO: refcnt chan */ ret[i] = chan; ++i; if (i == subchan_cnt) break; } KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d", pri_chan->ch_subchan_cnt, subchan_cnt)); mtx_unlock(&pri_chan->ch_subchan_lock); return ret; } void vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused) { free(subchan, M_TEMP); } void vmbus_subchan_drain(struct vmbus_channel *pri_chan) { mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt > 0) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0); mtx_unlock(&pri_chan->ch_subchan_lock); } void vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX, ("invalid message type %u", msg_type)); msg_proc = vmbus_chan_msgprocs[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); } void vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on) { if (!on) chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD; else chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; } uint32_t vmbus_chan_id(const struct vmbus_channel *chan) { return chan->ch_id; } uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan) { return chan->ch_subidx; } bool vmbus_chan_is_primary(const struct vmbus_channel *chan) { if (VMBUS_CHAN_ISPRIMARY(chan)) return true; else return false; } -const struct hyperv_guid * -vmbus_chan_guid_inst(const struct vmbus_channel *chan) +bool +vmbus_chan_is_hvs(const struct vmbus_channel *chan) +{ + return chan->ch_is_hvs; +} + +bool +vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan) +{ + KASSERT(vmbus_chan_is_hvs(chan) == true, + ("Not a HyperV Socket channel %u", chan->ch_id)); + if (chan->ch_hvs_conn_from_host != 0) + return true; + else + return false; +} + +struct hyperv_guid * +vmbus_chan_guid_type(struct vmbus_channel *chan) +{ + return &chan->ch_guid_type; +} + +struct hyperv_guid * +vmbus_chan_guid_inst(struct vmbus_channel *chan) { return &chan->ch_guid_inst; } int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max) { int elem_size; elem_size = __offsetof(struct vmbus_chanpkt_prplist, cp_range[0].gpa_page[prpcnt_max]); elem_size += dlen_max; elem_size = VMBUS_CHANPKT_TOTLEN(elem_size); return (vmbus_br_nelem(br_size, elem_size)); } bool vmbus_chan_tx_empty(const struct vmbus_channel *chan) { return (vmbus_txbr_empty(&chan->ch_txbr)); } bool vmbus_chan_rx_empty(const struct vmbus_channel *chan) { return (vmbus_rxbr_empty(&chan->ch_rxbr)); } static int vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...) { va_list ap; device_t dev; int retval; if (chan->ch_dev == NULL || !device_is_alive(chan->ch_dev)) dev = chan->ch_vmbus->vmbus_dev; else dev = chan->ch_dev; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task) { taskqueue_enqueue(chan->ch_tq, task); taskqueue_drain(chan->ch_tq, task); } struct taskqueue * vmbus_chan_mgmt_tq(const struct vmbus_channel *chan) { return (chan->ch_mgmt_tq); } bool vmbus_chan_is_revoked(const struct vmbus_channel *chan) { if (chan->ch_stflags & VMBUS_CHAN_ST_REVOKED) return (true); return (false); } void vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact) { sx_xlock(&chan->ch_orphan_lock); chan->ch_orphan_xact = xact; sx_xunlock(&chan->ch_orphan_lock); } void vmbus_chan_unset_orphan(struct vmbus_channel *chan) { sx_xlock(&chan->ch_orphan_lock); chan->ch_orphan_xact = NULL; sx_xunlock(&chan->ch_orphan_lock); } const void * vmbus_chan_xact_wait(const struct vmbus_channel *chan, struct vmbus_xact *xact, size_t *resp_len, bool can_sleep) { const void *ret; if (can_sleep) ret = vmbus_xact_wait(xact, resp_len); else ret = vmbus_xact_busywait(xact, resp_len); if (vmbus_chan_is_revoked(chan)) { /* * This xact probably is interrupted, and the * interruption can race the reply reception, * so we have to make sure that there are nothing * left on the RX bufring, i.e. this xact will * not be touched, once this function returns. * * Since the hypervisor will not put more data * onto the RX bufring once the channel is revoked, * the following loop will be terminated, once all * data are drained by the driver's channel * callback. */ while (!vmbus_chan_rx_empty(chan)) { if (can_sleep) pause("chxact", 1); else DELAY(1000); } } return (ret); } void vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz) { struct vmbus_chan_pollarg arg; struct task poll_cfg; KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, ("enable polling on non-batch chan%u", chan->ch_id)); KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN && pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz)); arg.poll_chan = chan; arg.poll_hz = pollhz; TASK_INIT(&poll_cfg, 0, vmbus_chan_pollcfg_task, &arg); vmbus_chan_run_task(chan, &poll_cfg); } void vmbus_chan_poll_disable(struct vmbus_channel *chan) { struct task poll_dis; KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, ("disable polling on non-batch chan%u", chan->ch_id)); TASK_INIT(&poll_dis, 0, vmbus_chan_polldis_task, chan); vmbus_chan_run_task(chan, &poll_dis); } Index: head/sys/dev/hyperv/vmbus/vmbus_chanvar.h =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_chanvar.h (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus_chanvar.h (revision 361275) @@ -1,189 +1,195 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_CHANVAR_H_ #define _VMBUS_CHANVAR_H_ #include #include #include #include #include #include #include #include #include #include #include #include struct vmbus_channel { /* * NOTE: * Fields before ch_txbr are only accessed on this channel's * target CPU. */ uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */ int ch_poll_flags; /* callout flags */ /* * RX bufring; immediately following ch_txbr. */ struct vmbus_rxbr ch_rxbr; struct taskqueue *ch_tq; struct task ch_task; struct task ch_poll_task; sbintime_t ch_poll_intvl; struct callout ch_poll_timeo; vmbus_chan_callback_t ch_cb; void *ch_cbarg; /* * TX bufring; at the beginning of ch_bufring. * * NOTE: * Put TX bufring and the following MNF/evtflag to a new * cacheline, since they will be accessed on all CPUs by * locking ch_txbr first. * * XXX * TX bufring and following MNF/evtflags do _not_ fit in * one 64B cacheline. */ struct vmbus_txbr ch_txbr __aligned(CACHE_LINE_SIZE); uint32_t ch_txflags; /* VMBUS_CHAN_TXF_ */ /* * These are based on the vmbus_chanmsg_choffer.chm_montrig. * Save it here for easy access. */ uint32_t ch_montrig_mask;/* MNF trig mask */ volatile uint32_t *ch_montrig; /* MNF trigger loc. */ /* * These are based on the vmbus_chanmsg_choffer.chm_chanid. * Save it here for easy access. */ u_long ch_evtflag_mask;/* event flag */ volatile u_long *ch_evtflag; /* event flag loc. */ /* * Rarely used fields. */ struct hyperv_mon_param *ch_monprm; struct hyperv_dma ch_monprm_dma; uint32_t ch_id; /* channel id */ device_t ch_dev; struct vmbus_softc *ch_vmbus; int ch_cpuid; /* owner cpu */ /* * Virtual cpuid for ch_cpuid; it is used to communicate cpuid * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not * exist, ch_vcpuid will always be 0 for compatibility. */ uint32_t ch_vcpuid; /* * If this is a primary channel, ch_subchan* fields * contain sub-channels belonging to this primary * channel. */ struct mtx ch_subchan_lock; TAILQ_HEAD(, vmbus_channel) ch_subchans; int ch_subchan_cnt; /* If this is a sub-channel */ TAILQ_ENTRY(vmbus_channel) ch_sublink; /* sub-channel link */ struct vmbus_channel *ch_prichan; /* owner primary chan */ void *ch_bufring; /* TX+RX bufrings */ struct hyperv_dma ch_bufring_dma; uint32_t ch_bufring_gpadl; struct task ch_attach_task; /* run in ch_mgmt_tq */ struct task ch_detach_task; /* run in ch_mgmt_tq */ struct taskqueue *ch_mgmt_tq; /* If this is a primary channel */ TAILQ_ENTRY(vmbus_channel) ch_prilink; /* primary chan link */ TAILQ_ENTRY(vmbus_channel) ch_link; /* channel link */ uint32_t ch_subidx; /* subchan index */ volatile uint32_t ch_stflags; /* atomic-op */ /* VMBUS_CHAN_ST_ */ struct hyperv_guid ch_guid_type; struct hyperv_guid ch_guid_inst; struct sx ch_orphan_lock; struct vmbus_xact_ctx *ch_orphan_xact; int ch_refs; + /* + * These are for HyperV socket channel only + */ + bool ch_is_hvs; + uint8_t ch_hvs_conn_from_host; + struct sysctl_ctx_list ch_sysctl_ctx; } __aligned(CACHE_LINE_SIZE); #define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0) /* * If this flag is set, this channel's interrupt will be masked in ISR, * and the RX bufring will be drained before this channel's interrupt is * unmasked. * * This flag is turned on by default. Drivers can turn it off according * to their own requirement. */ #define VMBUS_CHAN_FLAG_BATCHREAD 0x0002 #define VMBUS_CHAN_TXF_HASMNF 0x0001 #define VMBUS_CHAN_ST_OPENED_SHIFT 0 #define VMBUS_CHAN_ST_ONPRIL_SHIFT 1 #define VMBUS_CHAN_ST_ONSUBL_SHIFT 2 #define VMBUS_CHAN_ST_ONLIST_SHIFT 3 #define VMBUS_CHAN_ST_REVOKED_SHIFT 4 /* sticky */ #define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) #define VMBUS_CHAN_ST_ONPRIL (1 << VMBUS_CHAN_ST_ONPRIL_SHIFT) #define VMBUS_CHAN_ST_ONSUBL (1 << VMBUS_CHAN_ST_ONSUBL_SHIFT) #define VMBUS_CHAN_ST_ONLIST (1 << VMBUS_CHAN_ST_ONLIST_SHIFT) #define VMBUS_CHAN_ST_REVOKED (1 << VMBUS_CHAN_ST_REVOKED_SHIFT) struct vmbus_softc; struct vmbus_message; void vmbus_event_proc(struct vmbus_softc *, int); void vmbus_event_proc_compat(struct vmbus_softc *, int); void vmbus_chan_msgproc(struct vmbus_softc *, const struct vmbus_message *); void vmbus_chan_destroy_all(struct vmbus_softc *); #endif /* !_VMBUS_CHANVAR_H_ */ Index: head/sys/dev/hyperv/vmbus/vmbus_reg.h =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_reg.h (revision 361274) +++ head/sys/dev/hyperv/vmbus/vmbus_reg.h (revision 361275) @@ -1,336 +1,427 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_REG_H_ #define _VMBUS_REG_H_ #include #include /* XXX for hyperv_guid */ #include #include /* * Hyper-V SynIC message format. */ #define VMBUS_MSG_DSIZE_MAX 240 #define VMBUS_MSG_SIZE 256 struct vmbus_message { uint32_t msg_type; /* HYPERV_MSGTYPE_ */ uint8_t msg_dsize; /* data size */ uint8_t msg_flags; /* VMBUS_MSGFLAG_ */ uint16_t msg_rsvd; uint64_t msg_id; uint8_t msg_data[VMBUS_MSG_DSIZE_MAX]; } __packed; CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE); #define VMBUS_MSGFLAG_PENDING 0x01 /* * Hyper-V SynIC event flags */ #ifdef __LP64__ #define VMBUS_EVTFLAGS_MAX 32 #define VMBUS_EVTFLAG_SHIFT 6 #else #define VMBUS_EVTFLAGS_MAX 64 #define VMBUS_EVTFLAG_SHIFT 5 #endif #define VMBUS_EVTFLAG_LEN (1 << VMBUS_EVTFLAG_SHIFT) #define VMBUS_EVTFLAG_MASK (VMBUS_EVTFLAG_LEN - 1) #define VMBUS_EVTFLAGS_SIZE 256 struct vmbus_evtflags { u_long evt_flags[VMBUS_EVTFLAGS_MAX]; } __packed; CTASSERT(sizeof(struct vmbus_evtflags) == VMBUS_EVTFLAGS_SIZE); /* * Hyper-V Monitor Notification Facility */ struct vmbus_mon_trig { uint32_t mt_pending; uint32_t mt_armed; } __packed; #define VMBUS_MONTRIGS_MAX 4 #define VMBUS_MONTRIG_LEN 32 struct vmbus_mnf { uint32_t mnf_state; uint32_t mnf_rsvd1; struct vmbus_mon_trig mnf_trigs[VMBUS_MONTRIGS_MAX]; uint8_t mnf_rsvd2[536]; uint16_t mnf_lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; uint8_t mnf_rsvd3[256]; struct hyperv_mon_param mnf_param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; uint8_t mnf_rsvd4[1984]; } __packed; CTASSERT(sizeof(struct vmbus_mnf) == PAGE_SIZE); /* * Buffer ring */ struct vmbus_bufring { /* * If br_windex == br_rindex, this bufring is empty; this * means we can _not_ write data to the bufring, if the * write is going to make br_windex same as br_rindex. */ volatile uint32_t br_windex; volatile uint32_t br_rindex; /* * Interrupt mask {0,1} * * For TX bufring, host set this to 1, when it is processing * the TX bufring, so that we can safely skip the TX event * notification to host. * * For RX bufring, once this is set to 1 by us, host will not * further dispatch interrupts to us, even if there are data * pending on the RX bufring. This effectively disables the * interrupt of the channel to which this RX bufring is attached. */ volatile uint32_t br_imask; - uint8_t br_rsvd[4084]; + /* + * WS2012/Win8 and later versions of Hyper-V implement interrupt + * driven flow management. The feature bit feat_pending_snd_sz + * is set by the host on the host->guest buffer ring, and by the + * guest on the guest->host buffer ring. + * + * The meaning of the feature bit is a bit complex in that it has + * semantics that apply to both buffer rings. If the guest sets + * the feature bit in the guest->host buffer ring, the guest is + * telling the host that: + * 1) It will set the br_pending_snd_sz field in the guest->host buffer + * ring when it is waiting for space to become available, and + * 2) It will read the pending_send_sz field in the host->guest + * ring buffer and interrupt the host when it frees enough space + * + * Similarly, if the host sets the feature bit in the host->guest + * ring buffer, the host is telling the guest that: + * 1) It will set the pending_send_sz field in the host->guest ring + * buffer when it is waiting for space to become available, and + * 2) It will read the pending_send_sz field in the guest->host + * ring buffer and interrupt the guest when it frees enough space + * + * If either the guest or host does not set the feature bit that it + * owns, that guest or host must do polling if it encounters a full + * ring buffer, and not signal the other end with an interrupt. + */ + volatile uint32_t br_pending_snd_sz; + uint32_t br_rsvd1[12]; + union { + struct { + uint32_t feat_pending_snd_sz:1; + }; + uint32_t value; + } br_feature_bits; + + /* Padding to PAGE_SIZE */ + uint8_t br_rsvd2[4020]; + + /* + * Total guest to host interrupt count + * - For rx ring, this counts the guest signaling host when this rx + * ring changing from full to not full. + * + * - For tx ring, this counts the guest signaling host when this tx + * ring changing from empty to non empty. + */ + uint64_t br_g2h_intr_cnt; + uint8_t br_data[]; } __packed; CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE); /* * Channel */ #define VMBUS_CHAN_MAX_COMPAT 256 #define VMBUS_CHAN_MAX (VMBUS_EVTFLAG_LEN * VMBUS_EVTFLAGS_MAX) /* * Channel packets */ #define VMBUS_CHANPKT_SIZE_ALIGN (1 << VMBUS_CHANPKT_SIZE_SHIFT) #define VMBUS_CHANPKT_SETLEN(pktlen, len) \ do { \ (pktlen) = (len) >> VMBUS_CHANPKT_SIZE_SHIFT; \ } while (0) #define VMBUS_CHANPKT_TOTLEN(tlen) \ roundup2((tlen), VMBUS_CHANPKT_SIZE_ALIGN) #define VMBUS_CHANPKT_HLEN_MIN \ (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) struct vmbus_chanpkt { struct vmbus_chanpkt_hdr cp_hdr; } __packed; struct vmbus_chanpkt_sglist { struct vmbus_chanpkt_hdr cp_hdr; uint32_t cp_rsvd; uint32_t cp_gpa_cnt; struct vmbus_gpa cp_gpa[]; } __packed; struct vmbus_chanpkt_prplist { struct vmbus_chanpkt_hdr cp_hdr; uint32_t cp_rsvd; uint32_t cp_range_cnt; struct vmbus_gpa_range cp_range[]; } __packed; /* * Channel messages * - Embedded in vmbus_message.msg_data, e.g. response and notification. * - Embedded in hypercall_postmsg_in.hc_data, e.g. request. */ #define VMBUS_CHANMSG_TYPE_CHOFFER 1 /* NOTE */ #define VMBUS_CHANMSG_TYPE_CHRESCIND 2 /* NOTE */ #define VMBUS_CHANMSG_TYPE_CHREQUEST 3 /* REQ */ #define VMBUS_CHANMSG_TYPE_CHOFFER_DONE 4 /* NOTE */ #define VMBUS_CHANMSG_TYPE_CHOPEN 5 /* REQ */ #define VMBUS_CHANMSG_TYPE_CHOPEN_RESP 6 /* RESP */ #define VMBUS_CHANMSG_TYPE_CHCLOSE 7 /* REQ */ #define VMBUS_CHANMSG_TYPE_GPADL_CONN 8 /* REQ */ #define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN 9 /* REQ */ #define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP 10 /* RESP */ #define VMBUS_CHANMSG_TYPE_GPADL_DISCONN 11 /* REQ */ #define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP 12 /* RESP */ #define VMBUS_CHANMSG_TYPE_CHFREE 13 /* REQ */ #define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */ #define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */ #define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */ -#define VMBUS_CHANMSG_TYPE_MAX 22 +#define VMBUS_CHANMSG_TYPE_17 17 +#define VMBUS_CHANMSG_TYPE_18 18 +#define VMBUS_CHANMSG_TYPE_19 19 +#define VMBUS_CHANMSG_TYPE_20 20 +#define VMBUS_CHANMSG_TYPE_TL_CONN 21 /* REQ */ +#define VMBUS_CHANMSG_TYPE_22 22 +#define VMBUS_CHANMSG_TYPE_TL_RESULT 23 /* RESP */ +#define VMBUS_CHANMSG_TYPE_MAX 24 struct vmbus_chanmsg_hdr { uint32_t chm_type; /* VMBUS_CHANMSG_TYPE_ */ uint32_t chm_rsvd; } __packed; /* VMBUS_CHANMSG_TYPE_CONNECT */ struct vmbus_chanmsg_connect { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_ver; uint32_t chm_rsvd; uint64_t chm_evtflags; uint64_t chm_mnf1; uint64_t chm_mnf2; } __packed; /* VMBUS_CHANMSG_TYPE_CONNECT_RESP */ struct vmbus_chanmsg_connect_resp { struct vmbus_chanmsg_hdr chm_hdr; uint8_t chm_done; } __packed; /* VMBUS_CHANMSG_TYPE_CHREQUEST */ struct vmbus_chanmsg_chrequest { struct vmbus_chanmsg_hdr chm_hdr; } __packed; /* VMBUS_CHANMSG_TYPE_DISCONNECT */ struct vmbus_chanmsg_disconnect { struct vmbus_chanmsg_hdr chm_hdr; } __packed; +/* VMBUS_CHANMSG_TYPE_TL_CONN */ +/* Hyper-V socket guest connect request */ +struct vmbus_chanmsg_tl_connect { + struct vmbus_chanmsg_hdr chm_hdr; + struct hyperv_guid guest_endpoint_id; + struct hyperv_guid host_service_id; +} __packed; + + /* VMBUS_CHANMSG_TYPE_CHOPEN */ struct vmbus_chanmsg_chopen { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; uint32_t chm_openid; uint32_t chm_gpadl; uint32_t chm_vcpuid; uint32_t chm_txbr_pgcnt; #define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE 120 uint8_t chm_udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE]; } __packed; /* VMBUS_CHANMSG_TYPE_CHOPEN_RESP */ struct vmbus_chanmsg_chopen_resp { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; uint32_t chm_openid; uint32_t chm_status; } __packed; /* VMBUS_CHANMSG_TYPE_GPADL_CONN */ struct vmbus_chanmsg_gpadl_conn { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; uint32_t chm_gpadl; uint16_t chm_range_len; uint16_t chm_range_cnt; struct vmbus_gpa_range chm_range; } __packed; #define VMBUS_CHANMSG_GPADL_CONN_PGMAX 26 CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_conn, chm_range.gpa_page[VMBUS_CHANMSG_GPADL_CONN_PGMAX]) <= HYPERCALL_POSTMSGIN_DSIZE_MAX); /* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN */ struct vmbus_chanmsg_gpadl_subconn { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_msgno; uint32_t chm_gpadl; uint64_t chm_gpa_page[]; } __packed; #define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX 28 CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_subconn, chm_gpa_page[VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX]) <= HYPERCALL_POSTMSGIN_DSIZE_MAX); /* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP */ struct vmbus_chanmsg_gpadl_connresp { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; uint32_t chm_gpadl; uint32_t chm_status; } __packed; /* VMBUS_CHANMSG_TYPE_CHCLOSE */ struct vmbus_chanmsg_chclose { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; } __packed; /* VMBUS_CHANMSG_TYPE_GPADL_DISCONN */ struct vmbus_chanmsg_gpadl_disconn { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; uint32_t chm_gpadl; } __packed; /* VMBUS_CHANMSG_TYPE_CHFREE */ struct vmbus_chanmsg_chfree { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; } __packed; /* VMBUS_CHANMSG_TYPE_CHRESCIND */ struct vmbus_chanmsg_chrescind { struct vmbus_chanmsg_hdr chm_hdr; uint32_t chm_chanid; } __packed; +/* Size of the user defined data buffer for non-pipe offers */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE 120 + +/* Size of the user defined data buffer for pipe offers. */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE 116 + /* VMBUS_CHANMSG_TYPE_CHOFFER */ struct vmbus_chanmsg_choffer { struct vmbus_chanmsg_hdr chm_hdr; struct hyperv_guid chm_chtype; struct hyperv_guid chm_chinst; uint64_t chm_chlat; /* unit: 100ns */ uint32_t chm_chrev; uint32_t chm_svrctx_sz; uint16_t chm_chflags; uint16_t chm_mmio_sz; /* unit: MB */ - uint8_t chm_udata[120]; + + union { + /* Non-pipes */ + struct { + uint8_t user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE]; + } std; + /* + * Pipes: + * For integrated pipe protocol, which is implemented on + * top of standard user-defined data. Pipe clients have + * VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left for + * their own user. + */ + struct { + uint32_t pipe_mode; + uint8_t + user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE]; + } pipe; + } chm_udata; + uint16_t chm_subidx; uint16_t chm_rsvd; uint32_t chm_chanid; uint8_t chm_montrig; uint8_t chm_flags1; /* VMBUS_CHOFFER_FLAG1_ */ uint16_t chm_flags2; uint32_t chm_connid; } __packed; CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX); + +/* Server Flag */ +#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER 0x2000 #define VMBUS_CHOFFER_FLAG1_HASMNF 0x01 #endif /* !_VMBUS_REG_H_ */ Index: head/sys/modules/hyperv/Makefile =================================================================== --- head/sys/modules/hyperv/Makefile (revision 361274) +++ head/sys/modules/hyperv/Makefile (revision 361275) @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR = vmbus netvsc storvsc utilities +SUBDIR = vmbus netvsc storvsc utilities hvsock .include Index: head/sys/modules/hyperv/hvsock/Makefile =================================================================== --- head/sys/modules/hyperv/hvsock/Makefile (nonexistent) +++ head/sys/modules/hyperv/hvsock/Makefile (revision 361275) @@ -0,0 +1,13 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/dev/hyperv/hvsock + +KMOD= hv_sock +SRCS= hv_sock.c +SRCS+= hv_sock.h + +CFLAGS+= -I${SRCTOP}/sys/dev/hyperv/include \ + -I${SRCTOP}/sys/dev/hyperv/vmbus \ + -I${SRCTOP}/sys/dev/hyperv/hvsock + +.include Property changes on: head/sys/modules/hyperv/hvsock/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/sys/socket.h =================================================================== --- head/sys/sys/socket.h (revision 361274) +++ head/sys/sys/socket.h (revision 361275) @@ -1,732 +1,732 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socket.h 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition * to SOCK_CLOEXEC and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x00000001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ #define SO_REUSEADDR 0x00000004 /* allow local address reuse */ #define SO_KEEPALIVE 0x00000008 /* keep connections alive */ #define SO_DONTROUTE 0x00000010 /* just use interface addresses */ #define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x00000080 /* linger on close if data present */ #define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ #define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #define SO_DOMAIN 0x1019 /* get socket domain */ #endif #if __BSD_VISIBLE #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ -#define AF_MAX 42 +#define AF_HYPERV 43 /* HyperV sockets */ +#define AF_MAX 43 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 -#define AF_VENDOR02 43 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification */ #define MSG_NBIO 0x00004000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x00008000 /* used in sendit() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x00010000 /* for use by socket callbacks - soreceive (TCP) */ #endif #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x00020000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_CMSG_CLOEXEC 0x00040000 /* make received fds close-on-exec */ #define MSG_WAITFORONE 0x00080000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number or groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials. */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ #define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #define SCM_TIME_INFO 0x07 /* timestamp info */ struct sock_timestamp_info { __uint32_t st_info_flags; __uint32_t st_info_pad0; __uint64_t st_info_rsv[7]; }; #define ST_INFO_HW 0x0001 /* SCM_TIMESTAMP was hw */ #define ST_INFO_HW_HPREC 0x0002 /* SCM_TIMESTAMP was hw-assisted on entrance */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ #define SHUT_RD 0 /* shut down the reading side */ #define SHUT_WR 1 /* shut down the writing side */ #define SHUT_RDWR 2 /* shut down both sides */ #if __BSD_VISIBLE /* for SCTP */ /* we cheat and use the SHUT_XX defines for these */ #define PRU_FLUSH_RD SHUT_RD #define PRU_FLUSH_WR SHUT_WR #define PRU_FLUSH_RDWR SHUT_RDWR #endif #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; struct tcpcb *so_sototcpcb(struct socket *so); struct inpcb *so_sotoinpcb(struct socket *so); struct sockbuf *so_sockbuf_snd(struct socket *); struct sockbuf *so_sockbuf_rcv(struct socket *); int so_state_get(const struct socket *); void so_state_set(struct socket *, int); int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); int so_linger_get(const struct socket *); void so_linger_set(struct socket *, int); struct protosw *so_protosw_get(const struct socket *); void so_protosw_set(struct socket *, struct protosw *); void so_sorwakeup_locked(struct socket *so); void so_sowwakeup_locked(struct socket *so); void so_sorwakeup(struct socket *so); void so_sowwakeup(struct socket *so); void so_lock(struct socket *so); void so_unlock(struct socket *so); #endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */