Index: stable/10/sys/amd64/amd64/apic_vector.S =================================================================== --- stable/10/sys/amd64/amd64/apic_vector.S +++ stable/10/sys/amd64/amd64/apic_vector.S @@ -150,6 +150,22 @@ jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rdi + call hv_vector_handler + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. Index: stable/10/sys/amd64/conf/GENERIC =================================================================== --- stable/10/sys/amd64/conf/GENERIC +++ stable/10/sys/amd64/conf/GENERIC @@ -346,7 +346,9 @@ device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enchancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. 
+options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations Index: stable/10/sys/amd64/conf/NOTES =================================================================== --- stable/10/sys/amd64/conf/NOTES +++ stable/10/sys/amd64/conf/NOTES @@ -479,6 +479,8 @@ device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Microsoft Hyper-V enchancement support +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations Index: stable/10/sys/amd64/include/apicvar.h =================================================================== --- stable/10/sys/amd64/include/apicvar.h +++ stable/10/sys/amd64/include/apicvar.h @@ -216,6 +216,7 @@ void lapic_set_tpr(u_int vector); void lapic_setup(int boot); void xen_intr_handle_upcall(struct trapframe *frame); +void hv_vector_handler(struct trapframe *frame); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ Index: stable/10/sys/conf/options.amd64 =================================================================== --- stable/10/sys/conf/options.amd64 +++ stable/10/sys/conf/options.amd64 @@ -67,5 +67,7 @@ XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h Index: stable/10/sys/conf/options.i386 =================================================================== --- stable/10/sys/conf/options.i386 +++ stable/10/sys/conf/options.i386 @@ -127,5 +127,7 @@ XEN opt_global.h XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h Index: stable/10/sys/dev/hyperv/include/hyperv.h =================================================================== --- stable/10/sys/dev/hyperv/include/hyperv.h +++ stable/10/sys/dev/hyperv/include/hyperv.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -63,11 +64,22 @@ #define HV_ERROR_MACHINE_LOCKED 0x800704F7 /* - * A revision 
number of vmbus that is used for ensuring both ends on a - * partition are using compatible versions. - */ + * VMBUS version is 32 bit, upper 16 bit for major_number and lower + * 16 bit for minor_number. + * + * 0.13 -- Windows Server 2008 + * 1.1 -- Windows 7 + * 2.4 -- Windows 8 + * 3.0 -- Windows 8.1 + */ +#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13)) +#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1)) +#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4)) +#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) -#define HV_VMBUS_REVISION_NUMBER 13 +#define HV_VMBUS_VERSION_INVALID -1 + +#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1 /* * Make maximum size of pipe payload of 16K @@ -112,6 +124,18 @@ unsigned char data[16]; } __packed hv_guid; +#define HV_NIC_GUID \ + .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ + 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} + +#define HV_IDE_GUID \ + .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \ + 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} + +#define HV_SCSI_GUID \ + .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \ + 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} + /* * At the center of the Channel Management library is * the Channel Offer. This struct contains the @@ -147,7 +171,11 @@ } __packed pipe; } u; - uint32_t padding; + /* + * Sub_channel_index, newly added in Win8. + */ + uint16_t sub_channel_index; + uint16_t padding; } __packed hv_vmbus_channel_offer; @@ -344,7 +372,25 @@ hv_vmbus_channel_offer offer; uint32_t child_rel_id; uint8_t monitor_id; - hv_bool_uint8_t monitor_allocated; + /* + * This field has been split into a bit field on Win7 + * and higher. + */ + uint8_t monitor_allocated:1; + uint8_t reserved:7; + /* + * Following fields were added in win7 and higher. + * Make sure to check the version before accessing these fields. 
+ * + * If "is_dedicated_interrupt" is set, we must not set the + * associated bit in the channel bitmap while sending the + * interrupt to the host. + * + * connection_id is used in signaling the host. + */ + uint16_t is_dedicated_interrupt:1; + uint16_t reserved1:15; + uint32_t connection_id; } __packed hv_vmbus_channel_offer_channel; /* @@ -394,9 +440,11 @@ hv_gpadl_handle ring_buffer_gpadl_handle; /* - * GPADL for the channel's server context save area. + * Before win8, all incoming channel interrupts are only + * delivered on cpu 0. Setting this value to 0 would + * preserve the earlier behavior. */ - hv_gpadl_handle server_context_area_gpadl_handle; + uint32_t target_vcpu; /* * The upstream ring buffer begins at offset zero in the memory described @@ -646,14 +694,42 @@ } hv_vmbus_ring_buffer_info; typedef void (*hv_vmbus_pfn_channel_callback)(void *context); +typedef void (*hv_vmbus_sc_creation_callback)(void *context); typedef enum { HV_CHANNEL_OFFER_STATE, HV_CHANNEL_OPENING_STATE, HV_CHANNEL_OPEN_STATE, + HV_CHANNEL_OPENED_STATE, HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, } hv_vmbus_channel_state; +/* + * Connection identifier type + */ +typedef union { + uint32_t as_uint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + } u; + +} __packed hv_vmbus_connection_id; + +/* + * Definition of the hv_vmbus_signal_event hypercall input structure + */ +typedef struct { + hv_vmbus_connection_id connection_id; + uint16_t flag_number; + uint16_t rsvd_z; +} __packed hv_vmbus_input_signal_event; + +typedef struct { + uint64_t align8; + hv_vmbus_input_signal_event event; +} __packed hv_vmbus_input_signal_event_buffer; + typedef struct hv_vmbus_channel { TAILQ_ENTRY(hv_vmbus_channel) list_entry; struct hv_device* device; @@ -688,8 +764,82 @@ hv_vmbus_pfn_channel_callback on_channel_callback; void* channel_callback_context; + /* + * If batched_reading is set to "true", mask the interrupt + * and read until the channel is empty. 
+ * If batched_reading is set to "false", the channel is not + * going to perform batched reading. + * + * Batched reading is enabled by default; specific + * drivers that don't want this behavior can turn it off. + */ + boolean_t batched_reading; + + boolean_t is_dedicated_interrupt; + + /* + * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall. + */ + hv_vmbus_input_signal_event_buffer signal_event_buffer; + /* + * 8-bytes aligned of the buffer above + */ + hv_vmbus_input_signal_event *signal_event_param; + + /* + * From Win8, this field specifies the target virtual process + * on which to deliver the interupt from the host to guest. + * Before Win8, all channel interrupts would only be + * delivered on cpu 0. Setting this value to 0 would preserve + * the earlier behavior. + */ + uint32_t target_vcpu; + /* The corresponding CPUID in the guest */ + uint32_t target_cpu; + + /* + * Support for multi-channels. + * The initial offer is considered the primary channel and this + * offer message will indicate if the host supports multi-channels. + * The guest is free to ask for multi-channels to be offerred and can + * open these multi-channels as a normal "primary" channel. However, + * all multi-channels will have the same type and instance guids as the + * primary channel. Requests sent on a given channel will result in a + * response on the same channel. + */ + + /* + * Multi-channel creation callback. This callback will be called in + * process context when a Multi-channel offer is received from the host. + * The guest can open the Multi-channel in the context of this callback. + */ + hv_vmbus_sc_creation_callback sc_creation_callback; + + struct mtx sc_lock; + + /* + * Link list of all the multi-channels if this is a primary channel + */ + TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; + TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; + + /* + * The primary channel this sub-channle belongs to. + * This will be NULL for the primary channel. 
+ */ + struct hv_vmbus_channel *primary_channel; + /* + * Support per channel state for use by vmbus drivers. + */ + void *per_channel_state; } hv_vmbus_channel; +static inline void +hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) +{ + channel->batched_reading = state; +} + typedef struct hv_device { hv_guid class_id; hv_guid device_id; @@ -760,6 +910,8 @@ hv_vmbus_channel* channel, uint32_t gpadl_handle); +struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); + /* * Work abstraction defines */ @@ -819,6 +971,7 @@ extern uint8_t* receive_buffer[]; extern hv_vmbus_service service_table[]; +extern uint32_t hv_vmbus_protocal_version; void hv_kvp_callback(void *context); int hv_kvp_init(hv_vmbus_service *serv); Index: stable/10/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c =================================================================== --- stable/10/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ stable/10/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -53,8 +54,12 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include @@ -66,7 +71,6 @@ #include #include - #include #include "hv_vstorage.h" @@ -77,8 +81,29 @@ #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS #define STORVSC_MAX_TARGETS (2) +#define STORVSC_WIN7_MAJOR 4 +#define STORVSC_WIN7_MINOR 2 + +#define STORVSC_WIN8_MAJOR 5 +#define STORVSC_WIN8_MINOR 1 + +#define HV_ALIGN(x, a) roundup2(x, a) + struct storvsc_softc; +struct hv_sgl_node { + LIST_ENTRY(hv_sgl_node) link; + struct sglist *sgl_data; +}; + +struct hv_sgl_page_pool{ + LIST_HEAD(, hv_sgl_node) in_use_sgl_list; + LIST_HEAD(, hv_sgl_node) free_sgl_list; + boolean_t is_init; +} g_hv_sgl_page_pool; + +#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT + enum storvsc_request_type { WRITE_TYPE, READ_TYPE, @@ -96,20 
+121,24 @@ struct storvsc_softc *softc; struct callout callout; struct sema synch_sema; /*Synchronize the request/response if needed */ + struct sglist *bounce_sgl; + unsigned int bounce_sgl_count; + uint64_t not_aligned_seg_bits; }; struct storvsc_softc { struct hv_device *hs_dev; - LIST_HEAD(, hv_storvsc_request) hs_free_list; - struct mtx hs_lock; - struct storvsc_driver_props *hs_drv_props; - int hs_unit; - uint32_t hs_frozen; - struct cam_sim *hs_sim; - struct cam_path *hs_path; + LIST_HEAD(, hv_storvsc_request) hs_free_list; + struct mtx hs_lock; + struct storvsc_driver_props *hs_drv_props; + int hs_unit; + uint32_t hs_frozen; + struct cam_sim *hs_sim; + struct cam_path *hs_path; uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; + boolean_t hs_open_multi_channel; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; @@ -124,7 +153,7 @@ * The first can be tested by "sg_senddiag -vv /dev/daX", * and the second and third can be done by * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". 
- */ + */ #define HVS_TIMEOUT_TEST 0 /* @@ -138,7 +167,7 @@ char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; - uint8_t drv_max_ios_per_target; + uint8_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; @@ -150,6 +179,8 @@ #define HS_MAX_ADAPTERS 10 +#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 + /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ static const hv_guid gStorVscDeviceType={ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, @@ -171,13 +202,16 @@ STORVSC_RINGBUFFER_SIZE} }; +static int storvsc_current_major; +static int storvsc_current_minor; + /* static functions */ static int storvsc_probe(device_t dev); static int storvsc_attach(device_t dev); static int storvsc_detach(device_t dev); static void storvsc_poll(struct cam_sim * sim); static void storvsc_action(struct cam_sim * sim, union ccb * ccb); -static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); static enum hv_storage_type storvsc_get_storage_type(device_t dev); static void hv_storvsc_on_channel_callback(void *context); @@ -186,6 +220,14 @@ struct hv_storvsc_request *request); static int hv_storvsc_connect_vsp(struct hv_device *device); static void storvsc_io_done(struct hv_storvsc_request *reqp); +static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits); +void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits); static device_method_t storvsc_methods[] = { /* Device interface */ @@ -207,7 +249,7 @@ /** - * The host is capable of sending messages to us that are + * The host is capable of sending messages to us that are * completely unsolicited. 
So, we need to address the race * condition where we may be in the process of unloading the * driver when the host may send us an unsolicited message. @@ -223,7 +265,7 @@ * destroyed. * * 3. Once the device is marked as being destroyed, we only - * permit incoming traffic to properly account for + * permit incoming traffic to properly account for * packets already sent out. */ static inline struct storvsc_softc * @@ -260,6 +302,113 @@ } /** + * @brief Callback handler, will be invoked when receive mutil-channel offer + * + * @param context new multi-channel + */ +static void +storvsc_handle_sc_creation(void *context) +{ + hv_vmbus_channel *new_channel; + struct hv_device *device; + struct storvsc_softc *sc; + struct vmstor_chan_props props; + int ret = 0; + + new_channel = (hv_vmbus_channel *)context; + device = new_channel->primary_channel->device; + sc = get_stor_device(device, TRUE); + if (sc == NULL) + return; + + if (FALSE == sc->hs_open_multi_channel) + return; + + memset(&props, 0, sizeof(props)); + + ret = hv_vmbus_channel_open(new_channel, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, + new_channel); + + return; +} + +/** + * @brief Send multi-channel creation request to host + * + * @param device a Hyper-V device pointer + * @param max_chans the max channels supported by vmbus + */ +static void +storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) +{ + struct storvsc_softc *sc; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + int request_channels_cnt = 0; + int ret; + + /* get multichannels count that need to create */ + request_channels_cnt = MIN(max_chans, mp_ncpus); + + sc = get_stor_device(dev, TRUE); + if (sc == NULL) { + printf("Storvsc_error: get sc failed while send mutilchannel " + "request\n"); + return; + } + + request = &sc->hs_init_req; + + /* Establish a handler for 
multi-channel */ + dev->channel->sc_creation_callback = storvsc_handle_sc_creation; + + /* request the host to create multi-channel */ + memset(request, 0, sizeof(struct hv_storvsc_request)); + + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet = &request->vstor_packet; + + vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + vstor_packet->u.multi_channels_cnt = request_channels_cnt; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)(uintptr_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + /* wait for 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) { + printf("Storvsc_error: create multi-channel timeout, %d\n", + ret); + return; + } + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + printf("Storvsc_error: create multi-channel invalid operation " + "(%d) or statue (%u)\n", + vstor_packet->operation, vstor_packet->status); + return; + } + + sc->hs_open_multi_channel = TRUE; + + if (bootverbose) + printf("Storvsc create multi-channel success!\n"); +} + +/** * @brief initialize channel connection to parent partition * * @param dev a Hyper-V device pointer @@ -272,11 +421,15 @@ struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; struct storvsc_softc *sc; + uint16_t max_chans = 0; + boolean_t support_multichannel = FALSE; + + max_chans = 0; + support_multichannel = FALSE; sc = get_stor_device(dev, TRUE); - if (sc == NULL) { - return ENODEV; - } + if (sc == NULL) + return (ENODEV); request = &sc->hs_init_req; memset(request, 0, sizeof(struct hv_storvsc_request)); @@ -300,15 +453,13 @@ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); 
/* KYS 5 seconds */ - - if (ret != 0) { + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { @@ -321,7 +472,8 @@ vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; - vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; + vstor_packet->u.version.major_minor = + VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor); /* revision is only significant for Windows guests */ vstor_packet->u.version.revision = 0; @@ -334,21 +486,19 @@ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret) { + if (ret) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } /** * Query channel properties @@ -365,22 +515,30 @@ HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if ( ret != 0) { + if ( ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { + if (ret != 0) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) { goto cleanup; } + /* multi-channels feature is supported by WIN8 and above version */ + max_chans = vstor_packet->u.chan_props.max_channel_cnt; + if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) && + (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) && + 
(vstor_packet->u.chan_props.flags & + HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { + support_multichannel = TRUE; + } + memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; @@ -397,16 +555,22 @@ goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } + + /* + * If multi-channel is supported, send multichannel create + * request to host. + */ + if (support_multichannel) + storvsc_send_multichannel_request(dev, max_chans); cleanup: sema_destroy(&request->synch_sema); @@ -443,8 +607,7 @@ (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, - dev); - + dev->channel); if (ret != 0) { return ret; @@ -490,7 +653,7 @@ goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); /* KYS 5 seconds */ if (ret) { goto cleanup; @@ -498,7 +661,7 @@ /* - * At this point, all outstanding requests in the adapter + * At this point, all outstanding requests in the adapter * should have been flushed out and return to us */ @@ -521,6 +684,7 @@ { struct storvsc_softc *sc; struct vstor_packet *vstor_packet = &request->vstor_packet; + struct hv_vmbus_channel* outgoing_channel = NULL; int ret = 0; sc = get_stor_device(device, TRUE); @@ -539,19 +703,20 @@ vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; + outgoing_channel = vmbus_select_outgoing_channel(device->channel); mtx_unlock(&request->softc->hs_lock); if (request->data_buf.length) { ret = hv_vmbus_channel_send_packet_multipagebuffer( - device->channel, + outgoing_channel, &request->data_buf, - vstor_packet, - sizeof(struct 
vstor_packet), + vstor_packet, + sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request); } else { ret = hv_vmbus_channel_send_packet( - device->channel, + outgoing_channel, vstor_packet, sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request, @@ -610,7 +775,8 @@ hv_storvsc_on_channel_callback(void *context) { int ret = 0; - struct hv_device *device = (struct hv_device *)context; + hv_vmbus_channel *channel = (hv_vmbus_channel *)context; + struct hv_device *device = NULL; struct storvsc_softc *sc; uint32_t bytes_recvd; uint64_t request_id; @@ -618,15 +784,22 @@ struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; + if (channel->primary_channel != NULL){ + device = channel->primary_channel->device; + } else { + device = channel->device; + } + + KASSERT(device, ("device is NULL")); + sc = get_stor_device(device, FALSE); if (sc == NULL) { + printf("Storvsc_error: get stor device failed.\n"); return; } - KASSERT(device, ("device")); - ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -634,21 +807,28 @@ while ((ret == 0) && (bytes_recvd > 0)) { request = (struct hv_storvsc_request *)(uintptr_t)request_id; - KASSERT(request, ("request")); if ((request == &sc->hs_init_req) || (request == &sc->hs_reset_req)) { memcpy(&request->vstor_packet, packet, sizeof(struct vstor_packet)); - sema_post(&request->synch_sema); + sema_post(&request->synch_sema); } else { vstor_packet = (struct vstor_packet *)packet; switch(vstor_packet->operation) { case VSTOR_OPERATION_COMPLETEIO: + if (request == NULL) + panic("VMBUS: storvsc received a " + "packet with NULL request id in " + "COMPLETEIO operation."); + hv_storvsc_on_iocompletion(sc, vstor_packet, request); break; case VSTOR_OPERATION_REMOVEDEVICE: + case VSTOR_OPERATION_ENUMERATE_BUS: + printf("VMBUS: storvsc operation %d not " + "implemented.\n", vstor_packet->operation); /* TODO: implement */ break; default: @@ -656,7 +836,7 @@ 
} } ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -680,7 +860,16 @@ { int ata_disk_enable = 0; int ret = ENXIO; - + + if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) || + (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){ + storvsc_current_major = STORVSC_WIN8_MAJOR; + storvsc_current_minor = STORVSC_WIN8_MINOR; + } else { + storvsc_current_major = STORVSC_WIN7_MAJOR; + storvsc_current_minor = STORVSC_WIN7_MINOR; + } + switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) @@ -721,9 +910,11 @@ enum hv_storage_type stor_type; struct storvsc_softc *sc; struct cam_devq *devq; - int ret, i; + int ret, i, j; struct hv_storvsc_request *reqp; struct root_hold_token *root_mount_token = NULL; + struct hv_sgl_node *sgl_node = NULL; + void *tmp_buff = NULL; /* * We need to serialize storvsc attach calls. @@ -764,8 +955,41 @@ LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } + /* create sg-list page pool */ + if (FALSE == g_hv_sgl_page_pool.is_init) { + g_hv_sgl_page_pool.is_init = TRUE; + LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); + + /* + * Pre-create SG list, each SG list with + * HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each + * segment has one page buffer + */ + for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) { + sgl_node = malloc(sizeof(struct hv_sgl_node), + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data = + sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT, + M_WAITOK|M_ZERO); + + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + tmp_buff = malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data->sg_segs[j].ss_paddr = + (vm_paddr_t)tmp_buff; + } + + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, + sgl_node, link); + } + } + sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; + sc->hs_open_multi_channel = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = 
hv_storvsc_connect_vsp(hv_dev); @@ -834,6 +1058,20 @@ LIST_REMOVE(reqp, link); free(reqp, M_DEVBUF); } + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (ret); } @@ -853,6 +1091,8 @@ struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp = NULL; struct hv_device *hv_device = vmbus_get_devctx(dev); + struct hv_sgl_node *sgl_node = NULL; + int j = 0; mtx_lock(&hv_device->channel->inbound_lock); sc->hs_destroy = TRUE; @@ -884,6 +1124,20 @@ free(reqp, M_DEVBUF); } mtx_unlock(&sc->hs_lock); + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (0); } @@ -939,7 +1193,7 @@ ticks, __func__, (ret == 0)? "IO return detected" : "IO return not detected"); - /* + /* * Now both the timer handler and io done are running * simultaneously. We want to confirm the io done always * finishes after the timer handler exits. 
So reqp used by @@ -1023,7 +1277,7 @@ mtx_assert(&sc->hs_lock, MA_OWNED); mtx_unlock(&sc->hs_lock); - hv_storvsc_on_channel_callback(sc->hs_dev); + hv_storvsc_on_channel_callback(sc->hs_dev->channel); mtx_lock(&sc->hs_lock); } @@ -1151,9 +1405,13 @@ bzero(reqp, sizeof(struct hv_storvsc_request)); reqp->softc = sc; - - ccb->ccb_h.status |= CAM_SIM_QUEUED; - create_storvsc_request(ccb, reqp); + + ccb->ccb_h.status |= CAM_SIM_QUEUED; + if ((res = create_storvsc_request(ccb, reqp)) != 0) { + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; + } if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_init(&reqp->callout, CALLOUT_MPSAFE); @@ -1194,6 +1452,212 @@ } /** + * @brief destroy bounce buffer + * + * This function is responsible for destroy a Scatter/Gather list + * that create by storvsc_create_bounce_buffer() + * + * @param sgl- the Scatter/Gather need be destroy + * @param sg_count- page count of the SG list. + * + */ +static void +storvsc_destroy_bounce_buffer(struct sglist *sgl) +{ + struct hv_sgl_node *sgl_node = NULL; + + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_REMOVE(sgl_node, link); + if (NULL == sgl_node) { + printf("storvsc error: not enough in use sgl\n"); + return; + } + sgl_node->sgl_data = sgl; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); +} + +/** + * @brief create bounce buffer + * + * This function is responsible for create a Scatter/Gather list, + * which hold several pages that can be aligned with page size. + * + * @param seg_count- SG-list segments count + * @param write - if WRITE_TYPE, set SG list page used size to 0, + * otherwise set used size to page size. + * + * return NULL if create failed + */ +static struct sglist * +storvsc_create_bounce_buffer(uint16_t seg_count, int write) +{ + int i = 0; + struct sglist *bounce_sgl = NULL; + unsigned int buf_len = ((write == WRITE_TYPE) ? 
0 : PAGE_SIZE); + struct hv_sgl_node *sgl_node = NULL; + + /* get struct sglist from free_sgl_list */ + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + if (NULL == sgl_node) { + printf("storvsc error: not enough free sgl\n"); + return NULL; + } + bounce_sgl = sgl_node->sgl_data; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); + + bounce_sgl->sg_maxseg = seg_count; + + if (write == WRITE_TYPE) + bounce_sgl->sg_nseg = 0; + else + bounce_sgl->sg_nseg = seg_count; + + for (i = 0; i < seg_count; i++) + bounce_sgl->sg_segs[i].ss_len = buf_len; + + return bounce_sgl; +} + +/** + * @brief copy data from SG list to bounce buffer + * + * This function is responsible for copy data from one SG list's segments + * to another SG list which used as bounce buffer. + * + * @param bounce_sgl - the destination SG list + * @param orig_sgl - the segment of the source SG list. + * @param orig_sgl_count - the count of segments. + * @param orig_sgl_count - indicate which segment need bounce buffer, + * set 1 means need. + * + */ +static void +storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits) +{ + int src_sgl_idx = 0; + + for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { + if (seg_bits & (1 << src_sgl_idx)) { + memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, + (void*)orig_sgl[src_sgl_idx].ds_addr, + orig_sgl[src_sgl_idx].ds_len); + + bounce_sgl->sg_segs[src_sgl_idx].ss_len = + orig_sgl[src_sgl_idx].ds_len; + } + } +} + +/** + * @brief copy data from SG list which used as bounce to another SG list + * + * This function is responsible for copy data from one SG list with bounce + * buffer to another SG list's segments. + * + * @param dest_sgl - the destination SG list's segments + * @param dest_sgl_count - the count of destination SG list's segment. + * @param src_sgl - the source SG list. 
+ * @param seg_bits - indicate which segment used bounce buffer of src SG-list. + * + */ +void +storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits) +{ + int sgl_idx = 0; + + for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { + if (seg_bits & (1 << sgl_idx)) { + memcpy((void*)(dest_sgl[sgl_idx].ds_addr), + (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), + src_sgl->sg_segs[sgl_idx].ss_len); + } + } +} + +/** + * @brief check SG list with bounce buffer or not + * + * This function is responsible for check if need bounce buffer for SG list. + * + * @param sgl - the SG list's segments + * @param sg_count - the count of SG list's segment. + * @param bits - segmengs number that need bounce buffer + * + * return -1 if SG list needless bounce buffer + */ +static int +storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, + unsigned int sg_count, + uint64_t *bits) +{ + int i = 0; + int offset = 0; + uint64_t phys_addr = 0; + uint64_t tmp_bits = 0; + boolean_t found_hole = FALSE; + boolean_t pre_aligned = TRUE; + + if (sg_count < 2){ + return -1; + } + + *bits = 0; + + phys_addr = vtophys(sgl[0].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset != 0) { + pre_aligned = FALSE; + tmp_bits |= 1; + } + + for (i = 1; i < sg_count; i++) { + phys_addr = vtophys(sgl[i].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset == 0) { + if (FALSE == pre_aligned){ + /* + * This segment is aligned, if the previous + * one is not aligned, find a hole + */ + found_hole = TRUE; + } + pre_aligned = TRUE; + } else { + tmp_bits |= 1 << i; + if (!pre_aligned) { + if (phys_addr != vtophys(sgl[i-1].ds_addr + + sgl[i-1].ds_len)) { + /* + * Check whether connect to previous + * segment,if not, find the hole + */ + found_hole = TRUE; + } + } else { + found_hole = TRUE; + } + pre_aligned = FALSE; + } + } + + if (!found_hole) { + return (-1); + } else { + *bits = tmp_bits; + 
return 0; + } +} + +/** * @brief Fill in a request structure based on a CAM control block * * Fills in a request structure based on the contents of a CAM control @@ -1203,7 +1667,7 @@ * @param ccb pointer to a CAM contorl block * @param reqp pointer to a request structure */ -static void +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) { struct ccb_scsiio *csio = &ccb->csio; @@ -1211,6 +1675,7 @@ uint32_t bytes_to_copy = 0; uint32_t pfn_num = 0; uint32_t pfn; + uint64_t not_aligned_seg_bits = 0; /* refer to struct vmscsi_req for meanings of these two fields */ reqp->vstor_packet.u.vm_srb.port = @@ -1231,48 +1696,172 @@ } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { - case CAM_DIR_OUT: - reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; - break; - case CAM_DIR_IN: - reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; - break; - case CAM_DIR_NONE: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; - default: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; + case CAM_DIR_OUT: + reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + break; + case CAM_DIR_IN: + reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; + break; + case CAM_DIR_NONE: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; + default: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; } reqp->sense_data = &csio->sense_data; reqp->sense_info_len = csio->sense_len; reqp->ccb = ccb; - /* - KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0, - ("ccb is scatter gather valid\n")); - */ - if (csio->dxfer_len != 0) { - reqp->data_buf.length = csio->dxfer_len; + + if (0 == csio->dxfer_len) { + return (0); + } + + reqp->data_buf.length = csio->dxfer_len; + + switch (ccb->ccb_h.flags & CAM_DATA_MASK) { + case CAM_DATA_VADDR: + { bytes_to_copy = csio->dxfer_len; phys_addr = vtophys(csio->data_ptr); - reqp->data_buf.offset = phys_addr - trunc_page(phys_addr); + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + while (bytes_to_copy != 0) { 
+			int bytes, page_offset;
+			phys_addr =
+			    vtophys(&csio->data_ptr[reqp->data_buf.length -
+			    bytes_to_copy]);
+			pfn = phys_addr >> PAGE_SHIFT;
+			reqp->data_buf.pfn_array[pfn_num] = pfn;
+			page_offset = phys_addr & PAGE_MASK;
+
+			bytes = min(PAGE_SIZE - page_offset, bytes_to_copy);
+
+			bytes_to_copy -= bytes;
+			pfn_num++;
+		}
+		break;
 	}
-	while (bytes_to_copy != 0) {
-		int bytes, page_offset;
-		phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length -
-		bytes_to_copy]);
-		pfn = phys_addr >> PAGE_SHIFT;
-		reqp->data_buf.pfn_array[pfn_num] = pfn;
-		page_offset = phys_addr - trunc_page(phys_addr);
+	case CAM_DATA_SG:
+	{
+		int i = 0;
+		int offset = 0;
+		int ret;
+
+		bus_dma_segment_t *storvsc_sglist =
+		    (bus_dma_segment_t *)ccb->csio.data_ptr;
+		u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt;
+
+		printf("Storvsc: get SG I/O operation, %d\n",
+		    reqp->vstor_packet.u.vm_srb.data_in);
+
+		if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){
+			printf("Storvsc: %d segments is too much, "
+			    "only support %d segments\n",
+			    storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT);
+			return (EINVAL);
+		}
+
+		/*
+		 * We create our own bounce buffer function currently. Ideally
+		 * we should use BUS_DMA(9) framework. But with current BUS_DMA
+		 * code there is no callback API to check the page alignment of
+		 * middle segments before busdma can decide if a bounce buffer
+		 * is needed for a particular segment. There is a callback,
+		 * "bus_dma_filter_t *filter", but the parameters are not
+		 * sufficient for the storvsc driver.
+		 * TODO:
+		 *	Add page alignment check in BUS_DMA(9) callback. Once
+		 *	this is complete, switch the following code to use
+		 *	BUS_DMA(9) for storvsc bounce buffer support.
+ */ + /* check if we need to create bounce buffer */ + ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, + storvsc_sg_count, ¬_aligned_seg_bits); + if (ret != -1) { + reqp->bounce_sgl = + storvsc_create_bounce_buffer(storvsc_sg_count, + reqp->vstor_packet.u.vm_srb.data_in); + if (NULL == reqp->bounce_sgl) { + printf("Storvsc_error: " + "create bounce buffer failed.\n"); + return (ENOMEM); + } + + reqp->bounce_sgl_count = storvsc_sg_count; + reqp->not_aligned_seg_bits = not_aligned_seg_bits; + + /* + * if it is write, we need copy the original data + *to bounce buffer + */ + if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_sgl_to_bounce_buf( + reqp->bounce_sgl, + storvsc_sglist, + storvsc_sg_count, + reqp->not_aligned_seg_bits); + } + + /* transfer virtual address to physical frame number */ + if (reqp->not_aligned_seg_bits & 0x1){ + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); + }else{ + phys_addr = + vtophys(storvsc_sglist[0].ds_addr); + } + reqp->data_buf.offset = phys_addr & PAGE_MASK; - bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[0] = pfn; + + for (i = 1; i < storvsc_sg_count; i++) { + if (reqp->not_aligned_seg_bits & (1 << i)) { + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); + } else { + phys_addr = + vtophys(storvsc_sglist[i].ds_addr); + } + + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + } else { + phys_addr = vtophys(storvsc_sglist[0].ds_addr); - bytes_to_copy -= bytes; - pfn_num++; + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + for (i = 0; i < storvsc_sg_count; i++) { + phys_addr = vtophys(storvsc_sglist[i].ds_addr); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + + /* check the last segment cross boundary or not */ + offset = phys_addr & PAGE_MASK; + if (offset) { + phys_addr = + vtophys(storvsc_sglist[i-1].ds_addr + + PAGE_SIZE - offset); + pfn = phys_addr >> 
PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + + reqp->bounce_sgl_count = 0; + } + break; + } + default: + printf("Unknow flags: %d\n", ccb->ccb_h.flags); + return(EINVAL); } + + return(0); } /** @@ -1291,7 +1880,29 @@ struct ccb_scsiio *csio = &ccb->csio; struct storvsc_softc *sc = reqp->softc; struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; - + bus_dma_segment_t *ori_sglist = NULL; + int ori_sg_count = 0; + + /* destroy bounce buffer if it is used */ + if (reqp->bounce_sgl_count) { + ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; + ori_sg_count = ccb->csio.sglist_cnt; + + /* + * If it is READ operation, we should copy back the data + * to original SG list. + */ + if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, + ori_sg_count, + reqp->bounce_sgl, + reqp->not_aligned_seg_bits); + } + + storvsc_destroy_bounce_buffer(reqp->bounce_sgl); + reqp->bounce_sgl_count = 0; + } + if (reqp->retries > 0) { mtx_lock(&sc->hs_lock); #if HVS_TIMEOUT_TEST @@ -1309,7 +1920,7 @@ mtx_unlock(&sc->hs_lock); } - /* + /* * callout_drain() will wait for the timer handler to finish * if it is running. So we don't need any lock to synchronize * between this routine and the timer handler. Index: stable/10/sys/dev/hyperv/storvsc/hv_vstorage.h =================================================================== --- stable/10/sys/dev/hyperv/storvsc/hv_vstorage.h +++ stable/10/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -53,7 +53,7 @@ * V1 RC > 2008/1/31 2.0 */ -#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0) +#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) /** * Packet structure ops describing virtual storage requests. 
@@ -69,7 +69,10 @@ VSTOR_OPERATION_ENDINITIALIZATION = 8, VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, VSTOR_OPERATION_QUERYPROPERTIES = 10, - VSTOR_OPERATION_MAXIMUM = 10 + VSTOR_OPERATION_ENUMERATE_BUS = 11, + VSTOR_OPERATION_FCHBA_DATA = 12, + VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, + VSTOR_OPERATION_MAXIMUM = 13 }; @@ -123,10 +126,12 @@ uint8_t path_id; uint8_t target_id; + uint16_t max_channel_cnt; + /** * Note: port number is only really known on the client side */ - uint32_t port; + uint16_t port; uint32_t flags; uint32_t max_transfer_bytes; @@ -193,6 +198,11 @@ * Used during version negotiations. */ struct vmstor_proto_ver version; + + /** + * Number of multichannels to create + */ + uint16_t multi_channels_cnt; } u; } __packed; Index: stable/10/sys/dev/hyperv/utilities/hv_kvp.c =================================================================== --- stable/10/sys/dev/hyperv/utilities/hv_kvp.c +++ stable/10/sys/dev/hyperv/utilities/hv_kvp.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -232,7 +233,7 @@ */ if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { icframe_vercnt = 3; - if (icmsg_vercnt >= 2) + if (icmsg_vercnt > 2) icmsg_vercnt = 4; else icmsg_vercnt = 3; @@ -734,8 +735,8 @@ recvlen = 0; ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n", - __func__, context, pending_cnt, ret, recvlen); + hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", + __func__, context, (unsigned long long)pending_cnt, ret, recvlen); } } @@ -813,9 +814,9 @@ hv_kvp_dev_destroy(void) { - if (daemon_task != NULL) { + if (daemon_task != NULL) { PROC_LOCK(daemon_task); - kern_psignal(daemon_task, SIGKILL); + kern_psignal(daemon_task, SIGKILL); PROC_UNLOCK(daemon_task); } Index: stable/10/sys/dev/hyperv/utilities/hv_util.c 
=================================================================== --- stable/10/sys/dev/hyperv/utilities/hv_util.c +++ stable/10/sys/dev/hyperv/utilities/hv_util.c @@ -408,6 +408,15 @@ } } + /* + * These services are not performance critical and do not need + * batched reading. Furthermore, some services such as KVP can + * only handle one message from the host at a time. + * Turn off batched reading for all util drivers before we open the + * channel. + */ + hv_set_channel_read_state(hv_dev->channel, FALSE); + ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0, service->callback, hv_dev->channel); Index: stable/10/sys/dev/hyperv/vmbus/hv_channel.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_channel.c +++ stable/10/sys/dev/hyperv/vmbus/hv_channel.c @@ -75,7 +75,7 @@ (uint32_t *)&monitor_page-> trigger_group[channel->monitor_group].u.pending); } else { - hv_vmbus_set_event(channel->offer_msg.child_rel_id); + hv_vmbus_set_event(channel); } } @@ -99,6 +99,18 @@ hv_vmbus_channel_open_channel* open_msg; hv_vmbus_channel_msg_info* open_info; + mtx_lock(&new_channel->sc_lock); + if (new_channel->state == HV_CHANNEL_OPEN_STATE) { + new_channel->state = HV_CHANNEL_OPENING_STATE; + } else { + mtx_unlock(&new_channel->sc_lock); + if(bootverbose) + printf("VMBUS: Trying to open channel <%p> which in " + "%d state.\n", new_channel, new_channel->state); + return (EINVAL); + } + mtx_unlock(&new_channel->sc_lock); + new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; @@ -162,7 +174,7 @@ new_channel->ring_buffer_gpadl_handle; open_msg->downstream_ring_buffer_page_offset = send_ring_buffer_size >> PAGE_SHIFT; - open_msg->server_context_area_gpadl_handle = 0; + open_msg->target_vcpu = new_channel->target_vcpu; if (user_data_len) memcpy(open_msg->user_data, user_data, user_data_len); @@ -182,10 +194,14 @@ ret = 
sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ - if (ret) + if (ret) { + if(bootverbose) + printf("VMBUS: channel <%p> open timeout.\n", new_channel); goto cleanup; + } if (open_info->response.open_result.status == 0) { + new_channel->state = HV_CHANNEL_OPENED_STATE; if(bootverbose) printf("VMBUS: channel <%p> open success.\n", new_channel); } else { @@ -497,16 +513,20 @@ return (ret); } -/** - * @brief Close the specified channel - */ -void -hv_vmbus_channel_close(hv_vmbus_channel *channel) +static void +hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; + channel->state = HV_CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + + /* + * Grab the lock to prevent race condition when a packet received + * and unloading driver is in the process. + */ mtx_lock(&channel->inbound_lock); channel->on_channel_callback = NULL; mtx_unlock(&channel->inbound_lock); @@ -545,23 +565,37 @@ M_DEVBUF); free(info, M_DEVBUF); +} - /* - * If we are closing the channel during an error path in - * opening the channel, don't free the channel - * since the caller will free the channel - */ - if (channel->state == HV_CHANNEL_OPEN_STATE) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); +/** + * @brief Close the specified channel + */ +void +hv_vmbus_channel_close(hv_vmbus_channel *channel) +{ + hv_vmbus_channel* sub_channel; - hv_vmbus_free_vmbus_channel(channel); + if (channel->primary_channel != NULL) { + /* + * We only close multi-channels when the primary is + * closed. + */ + return; } + /* + * Close all multi-channels first. 
+ */ + TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, + sc_list_entry) { + if (sub_channel->state != HV_CHANNEL_OPENED_STATE) + continue; + hv_vmbus_channel_close_internal(sub_channel); + } + /* + * Then close the primary channel. + */ + hv_vmbus_channel_close_internal(channel); } /** @@ -581,6 +615,7 @@ uint32_t packet_len; uint64_t aligned_data; uint32_t packet_len_aligned; + boolean_t need_sig; hv_vmbus_sg_buffer_list buffer_list[3]; packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; @@ -604,12 +639,11 @@ buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 - && !hv_vmbus_get_ring_buffer_interrupt_mask( - &channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -632,6 +666,7 @@ int ret = 0; int i = 0; + boolean_t need_sig; uint32_t packet_len; uint32_t packetLen_aligned; hv_vmbus_sg_buffer_list buffer_list[3]; @@ -675,11 +710,11 @@ buffer_list[2].data = &alignedData; buffer_list[2].length = packetLen_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -700,6 +735,7 @@ int ret = 0; uint32_t desc_size; + boolean_t need_sig; uint32_t packet_len; uint32_t packet_len_aligned; uint32_t pfn_count; @@ -750,11 +786,11 @@ buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* 
TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } Index: stable/10/sys/dev/hyperv/vmbus/hv_channel_mgmt.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ stable/10/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -26,6 +26,9 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +__FBSDID("$FreeBSD$"); + #include #include @@ -50,6 +53,8 @@ static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_process_offer(void *context); +struct hv_vmbus_channel* + vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); /** * Channel message dispatch table @@ -233,6 +238,9 @@ return (NULL); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); + mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); + + TAILQ_INIT(&channel->sc_list_anchor); channel->control_work_queue = hv_work_queue_create("control"); @@ -262,6 +270,7 @@ void hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { + mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); /* * We have to release the channel's workqueue/thread in @@ -279,10 +288,10 @@ static void vmbus_channel_process_offer(void *context) { - int ret; hv_vmbus_channel* new_channel; boolean_t f_new; hv_vmbus_channel* channel; + int ret; new_channel = (hv_vmbus_channel*) context; f_new = TRUE; @@ -291,38 +300,76 @@ /* * Make sure this is a new offer */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { - if (!memcmp( - &channel->offer_msg.offer.interface_type, - 
&new_channel->offer_msg.offer.interface_type, - sizeof(hv_guid)) - && !memcmp( - &channel->offer_msg.offer.interface_instance, + if (memcmp(&channel->offer_msg.offer.interface_type, + &new_channel->offer_msg.offer.interface_type, + sizeof(hv_guid)) == 0 && + memcmp(&channel->offer_msg.offer.interface_instance, &new_channel->offer_msg.offer.interface_instance, - sizeof(hv_guid))) { - f_new = FALSE; - break; - } + sizeof(hv_guid)) == 0) { + f_new = FALSE; + break; + } } if (f_new) { - /* Insert at tail */ - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + /* Insert at tail */ + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + /*XXX add new channel to percpu_list */ if (!f_new) { + /* + * Check if this is a sub channel. + */ + if (new_channel->offer_msg.offer.sub_channel_index != 0) { + /* + * It is a sub channel offer, process it. + */ + new_channel->primary_channel = channel; + mtx_lock(&channel->sc_lock); + TAILQ_INSERT_TAIL( + &channel->sc_list_anchor, + new_channel, + sc_list_entry); + mtx_unlock(&channel->sc_lock); + + /* Insert new channel into channel_anchor. 
*/ + printf("Storvsc get multi-channel offer, rel=%u.\n", + new_channel->offer_msg.child_rel_id); + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + if(bootverbose) + printf("VMBUS: new multi-channel offer <%p>.\n", + new_channel); + + /*XXX add it to percpu_list */ + + new_channel->state = HV_CHANNEL_OPEN_STATE; + if (channel->sc_creation_callback != NULL) { + channel->sc_creation_callback(new_channel); + } + return; + } + hv_vmbus_free_vmbus_channel(new_channel); return; } + new_channel->state = HV_CHANNEL_OPEN_STATE; + /* * Start the process of binding this offer to the driver * (We need to set the device field before calling @@ -333,35 +380,86 @@ new_channel->offer_msg.offer.interface_instance, new_channel); /* - * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below - * but in the "open" channel request. The ret != 0 logic below - * doesn't take into account that a channel - * may have been opened successfully - */ - - /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() * method. 
*/ ret = hv_vmbus_child_device_register(new_channel->device); if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - hv_vmbus_free_vmbus_channel(new_channel); - } else { - /* - * This state is used to indicate a successful open - * so that when we do close the channel normally, - * we can clean up properly - */ - new_channel->state = HV_CHANNEL_OPEN_STATE; + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_free_vmbus_channel(new_channel); + } +} + +/** + * Array of device guids that are performance critical. We try to distribute + * the interrupt load for these devices across all online cpus. + */ +static const hv_guid high_perf_devices[] = { + {HV_NIC_GUID, }, + {HV_IDE_GUID, }, + {HV_SCSI_GUID, }, +}; + +enum { + PERF_CHN_NIC = 0, + PERF_CHN_IDE, + PERF_CHN_SCSI, + MAX_PERF_CHN, +}; +/* + * We use this static number to distribute the channel interrupt load. + */ +static uint32_t next_vcpu; + +/** + * Starting with Win8, we can statically distribute the incoming + * channel interrupt load by binding a channel to VCPU. We + * implement here a simple round robin scheme for distributing + * the interrupt load. + * We will bind channels that are not performance critical to cpu 0 and + * performance critical channels (IDE, SCSI and Network) will be uniformly + * distributed across all available CPUs. 
+ */ +static void +vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) +{ + uint32_t current_cpu; + int i; + boolean_t is_perf_channel = FALSE; + + for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { + if (memcmp(guid->data, high_perf_devices[i].data, + sizeof(hv_guid)) == 0) { + is_perf_channel = TRUE; + break; + } } + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || + (!is_perf_channel)) { + /* Host's view of guest cpu */ + channel->target_vcpu = 0; + /* Guest's own view of cpu */ + channel->target_cpu = 0; + return; + } + /* mp_ncpus should have the number cpus currently online */ + current_cpu = (++next_vcpu % mp_ncpus); + channel->target_cpu = current_cpu; + channel->target_vcpu = + hv_vmbus_g_context.hv_vcpu_index[current_cpu]; + if (bootverbose) + printf("VMBUS: Total online cpus %d, assign perf channel %d " + "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, + current_cpu); } /** @@ -391,6 +489,38 @@ if (new_channel == NULL) return; + /* + * By default we setup state to enable batched + * reading. A specific service can choose to + * disable this prior to opening the channel. + */ + new_channel->batched_reading = TRUE; + + new_channel->signal_event_param = + (hv_vmbus_input_signal_event *) + (HV_ALIGN_UP((unsigned long) + &new_channel->signal_event_buffer, + HV_HYPERCALL_PARAM_ALIGN)); + + new_channel->signal_event_param->connection_id.as_uint32_t = 0; + new_channel->signal_event_param->connection_id.u.id = + HV_VMBUS_EVENT_CONNECTION_ID; + new_channel->signal_event_param->flag_number = 0; + new_channel->signal_event_param->rsvd_z = 0; + + if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { + new_channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + new_channel->signal_event_param->connection_id.u.id = + offer->connection_id; + } + + /* + * Bind the channel to a chosen cpu. 
+	 */
+	vmbus_channel_select_cpu(new_channel,
+	    &offer->offer.interface_type);
+
 	memcpy(&new_channel->offer_msg, offer,
 	    sizeof(hv_vmbus_channel_offer_channel));
 	new_channel->monitor_group = (uint8_t) offer->monitor_id / 32;
@@ -666,7 +796,7 @@
 {
 	hv_vmbus_channel *channel;
 
-	mtx_lock_spin(&hv_vmbus_g_connection.channel_lock);
+	mtx_lock(&hv_vmbus_g_connection.channel_lock);
 
 	while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) {
 	    channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor);
@@ -676,5 +806,61 @@
 	    hv_vmbus_child_device_unregister(channel->device);
 	    hv_vmbus_free_vmbus_channel(channel);
 	}
-	mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock);
+	mtx_unlock(&hv_vmbus_g_connection.channel_lock);
+}
+
+/**
+ * @brief Select the best outgoing channel
+ *
+ * The channel whose vcpu binding is closest to the current vcpu will
+ * be selected.
+ * If there is no multi-channel, always select the primary channel.
+ *
+ * @param primary - primary channel
+ */
+struct hv_vmbus_channel *
+vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary)
+{
+	hv_vmbus_channel *new_channel = NULL;
+	hv_vmbus_channel *outgoing_channel = primary;
+	int old_cpu_distance = 0;
+	int new_cpu_distance = 0;
+	int cur_vcpu = 0;
+	int smp_pro_id = PCPU_GET(cpuid);
+
+	if (TAILQ_EMPTY(&primary->sc_list_anchor)) {
+		return outgoing_channel;
+	}
+
+	if (smp_pro_id >= MAXCPU) {
+		return outgoing_channel;
+	}
+
+	cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id];
+
+	TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) {
+		if (new_channel->state != HV_CHANNEL_OPENED_STATE){
+			continue;
+		}
+
+		if (new_channel->target_vcpu == cur_vcpu){
+			return new_channel;
+		}
+
+		old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ?
+		    (outgoing_channel->target_vcpu - cur_vcpu) :
+		    (cur_vcpu - outgoing_channel->target_vcpu));
+
+		new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ?
+ (new_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - new_channel->target_vcpu)); + + if (old_cpu_distance < new_cpu_distance) { + continue; + } + + outgoing_channel = new_channel; + } + + return(outgoing_channel); } Index: stable/10/sys/dev/hyperv/vmbus/hv_connection.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_connection.c +++ stable/10/sys/dev/hyperv/vmbus/hv_connection.c @@ -26,6 +26,9 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +__FBSDID("$FreeBSD$"); + #include #include #include @@ -45,14 +48,113 @@ { .connect_state = HV_DISCONNECTED, .next_gpadl_handle = 0xE1E10, }; +uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; + +static uint32_t +hv_vmbus_get_next_version(uint32_t current_ver) +{ + switch (current_ver) { + case (HV_VMBUS_VERSION_WIN7): + return(HV_VMBUS_VERSION_WS2008); + + case (HV_VMBUS_VERSION_WIN8): + return(HV_VMBUS_VERSION_WIN7); + + case (HV_VMBUS_VERSION_WIN8_1): + return(HV_VMBUS_VERSION_WIN8); + + case (HV_VMBUS_VERSION_WS2008): + default: + return(HV_VMBUS_VERSION_INVALID); + } +} + +/** + * Negotiate the highest supported hypervisor version. 
+ */ +static int +hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, + uint32_t version) +{ + int ret = 0; + hv_vmbus_channel_initiate_contact *msg; + + sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); + msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; + msg->vmbus_version_requested = version; + + msg->interrupt_page = hv_get_phys_addr( + hv_vmbus_g_connection.interrupt_page); + + msg->monitor_page_1 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_pages); + + msg->monitor_page_2 = + hv_get_phys_addr( + ((uint8_t *) hv_vmbus_g_connection.monitor_pages + + PAGE_SIZE)); + + /** + * Add to list before we send the request since we may receive the + * response before returning from this routine + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + msg, + sizeof(hv_vmbus_channel_initiate_contact)); + + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + return (ret); + } + + /** + * Wait for the connection response + */ + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + /** + * Check if successful + */ + if (msg_info->response.version_response.version_supported) { + hv_vmbus_g_connection.connect_state = HV_CONNECTED; + } else { + ret = ECONNREFUSED; + } + + return (ret); +} + /** * Send a connect request on the partition service connection */ int hv_vmbus_connect(void) 
{ int ret = 0; + uint32_t version; hv_vmbus_channel_msg_info* msg_info = NULL; - hv_vmbus_channel_initiate_contact* msg; /** * Make sure we are not connecting or connected @@ -74,7 +176,7 @@ TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", - NULL, MTX_SPIN); + NULL, MTX_DEF); /** * Setup the vmbus event connection for channel interrupt abstraction @@ -130,71 +232,30 @@ goto cleanup; } - sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); - msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; - - msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; - msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; - - msg->interrupt_page = hv_get_phys_addr( - hv_vmbus_g_connection.interrupt_page); - - msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); - - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); - - /** - * Add to list before we send the request since we may receive the - * response before returning from this routine + /* + * Find the highest vmbus version number we can support. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); + version = HV_VMBUS_VERSION_CURRENT; - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - ret = hv_vmbus_post_message( - msg, - sizeof(hv_vmbus_channel_initiate_contact)); - - if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - goto cleanup; - } + do { + ret = hv_vmbus_negotiate_version(msg_info, version); + if (ret == EWOULDBLOCK) { + /* + * We timed out. 
+ */ + goto cleanup; + } - /** - * Wait for the connection response - */ - ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) + break; - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + version = hv_vmbus_get_next_version(version); + } while (version != HV_VMBUS_VERSION_INVALID); - /** - * Check if successful - */ - if (msg_info->response.version_response.version_supported) { - hv_vmbus_g_connection.connect_state = HV_CONNECTED; - } else { - ret = ECONNREFUSED; - goto cleanup; - } + hv_vmbus_protocal_version = version; + if (bootverbose) + printf("VMBUS: Portocal Version: %d.%d\n", + version >> 16, version & 0xFFFF); sema_destroy(&msg_info->wait_sema); free(msg_info, M_DEVBUF); @@ -286,7 +347,7 @@ * and channels are accessed without the need to take this lock or search * the list. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { @@ -295,7 +356,7 @@ break; } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); return (foundChannel); } @@ -306,7 +367,10 @@ static void VmbusProcessChannelEvent(uint32_t relid) { + void* arg; + uint32_t bytes_to_read; hv_vmbus_channel* channel; + boolean_t is_batched_reading; /** * Find the channel based on this relid and invokes @@ -327,31 +391,98 @@ * callback to NULL. This closes the window. */ - mtx_lock(&channel->inbound_lock); + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. 
+ * -- whu + */ + // mtx_lock(&channel->inbound_lock); if (channel->on_channel_callback != NULL) { - channel->on_channel_callback(channel->channel_callback_context); + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); } - mtx_unlock(&channel->inbound_lock); + // mtx_unlock(&channel->inbound_lock); } +#ifdef HV_DEBUG_INTR +extern uint32_t hv_intr_count; +extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; +#endif + /** * Handler for events */ void hv_vmbus_on_events(void *arg) { - int dword; int bit; + int cpu; + int dword; + void *page_addr; + uint32_t* recv_interrupt_page = NULL; int rel_id; - int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + int maxdword; + hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - /* - * receive size is 1/2 page and divide that by 4 bytes - */ - - uint32_t* recv_interrupt_page = - hv_vmbus_g_connection.recv_interrupt_page; + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " + "cpu out of range!")); + +#ifdef HV_DEBUG_INTR + int i; + hv_vmbus_swintr_event_cpu[cpu]++; + if (hv_intr_count % 10000 == 0) { + printf("VMBUS: Total interrupt %d\n", hv_intr_count); + for (i = 0; i < mp_ncpus; i++) + 
printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", + i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); + } +#endif + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + /* + * receive size is 1/2 page and divide that by 4 bytes + */ + recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; + } else { + /* + * On Host with Win8 or above, the event page can be + * checked directly to get the id of the channel + * that has the pending interrupt. + */ + maxdword = HV_EVENT_FLAGS_DWORD_COUNT; + page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; + event = (hv_vmbus_synic_event_flags *) + page_addr + HV_VMBUS_MESSAGE_SINT; + recv_interrupt_page = event->flags32; + } /* * Check events @@ -416,16 +547,16 @@ * Send an event notification to the parent */ int -hv_vmbus_set_event(uint32_t child_rel_id) { +hv_vmbus_set_event(hv_vmbus_channel *channel) { int ret = 0; + uint32_t child_rel_id = channel->offer_msg.child_rel_id; /* Each uint32_t represents 32 channels */ synch_set_bit(child_rel_id & 31, (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + (child_rel_id >> 5)))); - ret = hv_vmbus_signal_event(); + ret = hv_vmbus_signal_event(channel->signal_event_param); return (ret); } - Index: stable/10/sys/dev/hyperv/vmbus/hv_hv.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_hv.c +++ stable/10/sys/dev/hyperv/vmbus/hv_hv.c @@ -67,8 +67,6 @@ hv_vmbus_context hv_vmbus_g_context = { .syn_ic_initialized = FALSE, .hypercall_page = NULL, - .signal_event_param = NULL, - .signal_event_buffer = NULL, }; static struct timecounter hv_timecounter = { @@ -256,28 +254,6 @@ hv_vmbus_g_context.hypercall_page = virt_addr; - /* - * Setup the global signal event param for the signal event hypercall - */ - hv_vmbus_g_context.signal_event_buffer = - 
malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, - M_ZERO | M_NOWAIT); - KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, - ("Error VMBUS: Failed to allocate signal_event_buffer\n")); - if (hv_vmbus_g_context.signal_event_buffer == NULL) - goto cleanup; - - hv_vmbus_g_context.signal_event_param = - (hv_vmbus_input_signal_event*) - (HV_ALIGN_UP((unsigned long) - hv_vmbus_g_context.signal_event_buffer, - HV_HYPERCALL_PARAM_ALIGN)); - hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; - hv_vmbus_g_context.signal_event_param->connection_id.u.id = - HV_VMBUS_EVENT_CONNECTION_ID; - hv_vmbus_g_context.signal_event_param->flag_number = 0; - hv_vmbus_g_context.signal_event_param->rsvd_z = 0; - tc_init(&hv_timecounter); /* register virtual timecount */ return (0); @@ -303,12 +279,6 @@ { hv_vmbus_x64_msr_hypercall_contents hypercall_msr; - if (hv_vmbus_g_context.signal_event_buffer != NULL) { - free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); - hv_vmbus_g_context.signal_event_buffer = NULL; - hv_vmbus_g_context.signal_event_param = NULL; - } - if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { if (hv_vmbus_g_context.hypercall_page != NULL) { hypercall_msr.as_uint64_t = 0; @@ -370,13 +340,13 @@ * event IPC. (This involves a hypercall.) */ hv_vmbus_status -hv_vmbus_signal_event() +hv_vmbus_signal_event(void *con_id) { hv_vmbus_status status; status = hv_vmbus_do_hypercall( HV_CALL_SIGNAL_EVENT, - hv_vmbus_g_context.signal_event_param, + con_id, 0) & 0xFFFF; return (status); @@ -390,6 +360,7 @@ { int cpu; + uint64_t hv_vcpu_index; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; hv_vmbus_synic_scontrol sctrl; @@ -403,23 +374,14 @@ return; /* - * KYS: Looks like we can only initialize on cpu0; don't we support - * SMP guests? 
- * - * TODO: Need to add SMP support for FreeBSD V9 - */ - - if (cpu != 0) - return; - - /* * TODO: Check the version */ version = rdmsr(HV_X64_MSR_SVERSION); - - hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; - hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; + hv_vmbus_g_context.syn_ic_msg_page[cpu] = + setup_args->page_buffers[2 * cpu]; + hv_vmbus_g_context.syn_ic_event_page[cpu] = + setup_args->page_buffers[2 * cpu + 1]; /* * Setup the Synic's message page @@ -443,9 +405,10 @@ wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ + shared_sint.as_uint64_t = 0; shared_sint.u.vector = setup_args->vector; shared_sint.u.masked = FALSE; - shared_sint.u.auto_eoi = FALSE; + shared_sint.u.auto_eoi = TRUE; wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); @@ -458,6 +421,13 @@ hv_vmbus_g_context.syn_ic_initialized = TRUE; + /* + * Set up the cpuid mapping from Hyper-V to FreeBSD. + * The array is indexed using FreeBSD cpuid. + */ + hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); + hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; + return; } @@ -469,14 +439,10 @@ hv_vmbus_synic_sint shared_sint; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; - int cpu = PCPU_GET(cpuid); if (!hv_vmbus_g_context.syn_ic_initialized) return; - if (cpu != 0) - return; /* TODO: XXXKYS: SMP? */ - shared_sint.as_uint64_t = rdmsr( HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); Index: stable/10/sys/dev/hyperv/vmbus/hv_ring_buffer.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_ring_buffer.c +++ stable/10/sys/dev/hyperv/vmbus/hv_ring_buffer.c @@ -26,6 +26,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include +__FBSDID("$FreeBSD$"); #include #include @@ -144,6 +146,69 @@ return (uint64_t) ring_info->ring_buffer->write_index << 32; } +void +hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info* ring_info) +{ + ring_info->ring_buffer->interrupt_mask = 1; + mb(); +} + +uint32_t +hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info* ring_info) +{ + uint32_t read, write; + + ring_info->ring_buffer->interrupt_mask = 0; + mb(); + + /* + * Now check to see if the ring buffer is still empty. + * If it is not, we raced and we need to process new + * incoming messages. + */ + get_ring_buffer_avail_bytes(ring_info, &read, &write); + + return (read); +} + +/* + * When we write to the ring buffer, check if the host needs to + * be signaled. Here is the details of this protocol: + * + * 1. The host guarantees that while it is draining the + * ring buffer, it will set the interrupt_mask to + * indicate it does not need to be interrupted when + * new data is placed. + * + * 2. The host guarantees that it will completely drain + * the ring buffer before exiting the read loop. Further, + * once the ring buffer is empty, it will clear the + * interrupt_mask and re-check to see if new data has + * arrived. + */ +static boolean_t +hv_ring_buffer_needsig_on_write( + uint32_t old_write_location, + hv_vmbus_ring_buffer_info* rbi) +{ + mb(); + if (rbi->ring_buffer->interrupt_mask) + return (FALSE); + + /* Read memory barrier */ + rmb(); + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. 
+ */ + if (old_write_location == rbi->ring_buffer->read_index) + return (TRUE); + + return (FALSE); +} + static uint32_t copy_to_ring_buffer( hv_vmbus_ring_buffer_info* ring_info, uint32_t start_write_offset, @@ -204,11 +269,13 @@ hv_ring_buffer_write( hv_vmbus_ring_buffer_info* out_ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buffer_count) + uint32_t sg_buffer_count, + boolean_t *need_sig) { int i = 0; uint32_t byte_avail_to_write; uint32_t byte_avail_to_read; + uint32_t old_write_location; uint32_t total_bytes_to_write = 0; volatile uint32_t next_write_location; @@ -242,6 +309,8 @@ */ next_write_location = get_next_write_location(out_ring_info); + old_write_location = next_write_location; + for (i = 0; i < sg_buffer_count; i++) { next_write_location = copy_to_ring_buffer(out_ring_info, next_write_location, (char *) sg_buffers[i].data, @@ -258,9 +327,9 @@ (char *) &prev_indices, sizeof(uint64_t)); /* - * Make sure we flush all writes before updating the writeIndex + * Full memory barrier before updating the write index. 
*/ - wmb(); + mb(); /* * Now, update the write location @@ -269,6 +338,9 @@ mtx_unlock_spin(&out_ring_info->ring_lock); + *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, + out_ring_info); + return (0); } Index: stable/10/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ stable/10/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -53,22 +53,17 @@ #include #include +#include +#include #include +#include #include "hv_vmbus_priv.h" #define VMBUS_IRQ 0x5 -static struct intr_event *hv_msg_intr_event; -static struct intr_event *hv_event_intr_event; -static void *msg_swintr; -static void *event_swintr; static device_t vmbus_devp; -static void *vmbus_cookiep; -static int vmbus_rid; -struct resource *intr_res; -static int vmbus_irq = VMBUS_IRQ; static int vmbus_inited; static hv_setup_args setup_args; /* only CPU 0 supported at this time */ @@ -77,14 +72,17 @@ * the hypervisor. */ static void -vmbus_msg_swintr(void *dummy) +vmbus_msg_swintr(void *arg) { int cpu; void* page_addr; hv_vmbus_message* msg; hv_vmbus_message* copied; - cpu = PCPU_GET(cpuid); + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " + "cpu out of range!")); + page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; @@ -130,17 +128,8 @@ * * The purpose of this routine is to determine the type of VMBUS protocol * message to process - an event or a channel message. - * As this is an interrupt filter routine, the function runs in a very - * restricted envinronment. From the manpage for bus_setup_intr(9) - * - * In this restricted environment, care must be taken to account for all - * races. A careful analysis of races should be done as well. It is gener- - * ally cheaper to take an extra interrupt, for example, than to protect - * variables with spinlocks. 
Read, modify, write cycles of hardware regis- - * ters need to be carefully analyzed if other threads are accessing the - * same registers. */ -static int +static inline int hv_vmbus_isr(void *unused) { int cpu; @@ -149,8 +138,6 @@ void* page_addr; cpu = PCPU_GET(cpuid); - /* (Temporary limit) */ - KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); /* * The Windows team has advised that we check for events @@ -162,9 +149,21 @@ event = (hv_vmbus_synic_event_flags*) page_addr + HV_VMBUS_MESSAGE_SINT; - /* Since we are a child, we only need to check bit 0 */ - if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(event_swintr, 0); + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + /* Since we are a child, we only need to check bit 0 */ + if (synch_test_and_clear_bit(0, &event->flags32[0])) { + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + } + } else { + /* + * On host with Win8 or above, we can directly look at + * the event page. If bit n is set, we have an interrupt + * on the channel with id n. + * Directly schedule the event software interrupt on + * current cpu. + */ + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); } /* Check if there are actual msgs to be process */ @@ -172,12 +171,47 @@ msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { - swi_sched(msg_swintr, 0); + swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); } return FILTER_HANDLED; } +#ifdef HV_DEBUG_INTR +uint32_t hv_intr_count = 0; +#endif +uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +uint32_t hv_vmbus_intr_cpu[MAXCPU]; + +void +hv_vector_handler(struct trapframe *trap_frame) +{ +#ifdef HV_DEBUG_INTR + int cpu; +#endif + + /* + * Disable preemption. + */ + critical_enter(); + +#ifdef HV_DEBUG_INTR + /* + * Do a little interrupt counting. 
+ */ + cpu = PCPU_GET(cpuid); + hv_vmbus_intr_cpu[cpu]++; + hv_intr_count++; +#endif + + hv_vmbus_isr(NULL); + + /* + * Enable preemption. + */ + critical_exit(); +} + static int vmbus_read_ivar( device_t dev, @@ -316,6 +350,81 @@ return (BUS_PROBE_NOWILDCARD); } +#ifdef HYPERV +extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); + +/** + * @brief Find a free IDT slot and setup the interrupt handler. + */ +static int +vmbus_vector_alloc(void) +{ + int vector; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as vmbus channel callback vector. We install 'hv_vmbus_callback' + * handler at that vector and use it to interrupt vcpus. + */ + vector = APIC_SPURIOUS_INT; + while (--vector >= APIC_IPI_INTS) { + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { +#ifdef __i386__ + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#else + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, + SEL_KPL, 0); +#endif + + return (vector); + } + } + return (0); +} + +/** + * @brief Restore the IDT slot to rsvd. + */ +static void +vmbus_vector_free(int vector) +{ + uintptr_t func; + struct gate_descriptor *ip; + + if (vector == 0) + return; + + KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, + ("invalid vector %d", vector)); + + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), + ("invalid vector %d", vector)); + + setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +#else /* HYPERV */ + +static int +vmbus_vector_alloc(void) +{ + return(0); +} + +static void +vmbus_vector_free(int vector) +{ +} + +#endif /* HYPERV */ + /** * @brief Main vmbus driver initialization routine. 
* @@ -331,22 +440,7 @@ static int vmbus_bus_init(void) { - struct ioapic_intsrc { - struct intsrc io_intsrc; - u_int io_irq; - u_int io_intpin:8; - u_int io_vector:8; - u_int io_cpu:8; - u_int io_activehi:1; - u_int io_edgetrigger:1; - u_int io_masked:1; - int io_bus:4; - uint32_t io_lowreg; - }; - int i, ret; - unsigned int vector = 0; - struct intsrc *isrc; - struct ioapic_intsrc *intpin; + int i, j, n, ret; if (vmbus_inited) return (0); @@ -361,80 +455,100 @@ return (ret); } - ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, - NULL, SWI_CLOCK, 0, &msg_swintr); - - if (ret) - goto cleanup; - /* - * Message SW interrupt handler checks a per-CPU page and - * thus the thread needs to be bound to CPU-0 - which is where - * all interrupts are processed. + * Find a free IDT slot for vmbus callback. */ - ret = intr_event_bind(hv_msg_intr_event, 0); + hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); - if (ret) - goto cleanup1; + if (hv_vmbus_g_context.hv_cb_vector == 0) { + if(bootverbose) + printf("Error VMBUS: Cannot find free IDT slot for " + "vmbus callback!\n"); + goto cleanup; + } - ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, - NULL, SWI_CLOCK, 0, &event_swintr); + if(bootverbose) + printf("VMBUS: vmbus callback vector %d\n", + hv_vmbus_g_context.hv_cb_vector); - if (ret) - goto cleanup1; + /* + * Notify the hypervisor of our vector. 
+ */ + setup_args.vector = hv_vmbus_g_context.hv_cb_vector; - intr_res = bus_alloc_resource(vmbus_devp, - SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); + CPU_FOREACH(j) { + hv_vmbus_intr_cpu[j] = 0; + hv_vmbus_swintr_event_cpu[j] = 0; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.event_swintr[j] = NULL; + hv_vmbus_g_context.msg_swintr[j] = NULL; - if (intr_res == NULL) { - ret = ENOMEM; /* XXXKYS: Need a better errno */ - goto cleanup2; + for (i = 0; i < 2; i++) + setup_args.page_buffers[2 * j + i] = NULL; } /* - * Setup interrupt filter handler + * Per cpu setup. */ - ret = bus_setup_intr(vmbus_devp, intr_res, - INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, - NULL, &vmbus_cookiep); - - if (ret != 0) - goto cleanup3; - - ret = bus_bind_intr(vmbus_devp, intr_res, 0); - if (ret != 0) - goto cleanup4; - - isrc = intr_lookup_source(vmbus_irq); - if ((isrc == NULL) || (isrc->is_event == NULL)) { - ret = EINVAL; - goto cleanup4; - } + CPU_FOREACH(j) { + /* + * Setup software interrupt thread and handler for msg handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], + "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, + &hv_vmbus_g_context.msg_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup msg swi for " + "cpu %d\n", j); + goto cleanup1; + } - /* vector = isrc->is_event->ie_vector; */ - intpin = (struct ioapic_intsrc *)isrc; - vector = intpin->io_vector; + /* + * Bind the swi thread to the cpu. + */ + ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], + j); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to bind msg swi thread " + "to cpu %d\n", j); + goto cleanup1; + } - if(bootverbose) - printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); + /* + * Setup software interrupt thread and handler for + * event handling. 
+ */ + ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], + "hv_event", hv_vmbus_on_events, (void *)(long)j, + SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup event swi for " + "cpu %d\n", j); + goto cleanup1; + } - /** - * Notify the hypervisor of our irq. - */ - setup_args.vector = vector; - for(i = 0; i < 2; i++) { - setup_args.page_buffers[i] = + /* + * Prepare the per cpu msg and event pages to be called on each cpu. + */ + for(i = 0; i < 2; i++) { + setup_args.page_buffers[2 * j + i] = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[i] == NULL) { - KASSERT(setup_args.page_buffers[i] != NULL, + if (setup_args.page_buffers[2 * j + i] == NULL) { + KASSERT(setup_args.page_buffers[2 * j + i] != NULL, ("Error VMBUS: malloc failed!")); - if (i > 0) - free(setup_args.page_buffers[0], M_DEVBUF); - goto cleanup4; + goto cleanup1; + } } } - /* only CPU #0 supported at this time */ + if (bootverbose) + printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", + smp_started); + smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); /* @@ -443,26 +557,32 @@ ret = hv_vmbus_connect(); if (ret != 0) - goto cleanup4; + goto cleanup1; hv_vmbus_request_channel_offers(); return (ret); - cleanup4: - + cleanup1: /* - * remove swi, bus and intr resource + * Free pages alloc'ed */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + for (n = 0; n < 2 * MAXCPU; n++) + if (setup_args.page_buffers[n] != NULL) + free(setup_args.page_buffers[n], M_DEVBUF); - cleanup3: - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); - - cleanup2: - swi_remove(event_swintr); + /* + * remove swi and vmbus callback vector; + */ + CPU_FOREACH(j) { + if (hv_vmbus_g_context.msg_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[j]); + if (hv_vmbus_g_context.event_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[j]); + 
hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + } - cleanup1: - swi_remove(msg_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); cleanup: hv_vmbus_cleanup(); @@ -515,20 +635,24 @@ smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); - for(i = 0; i < 2; i++) { + for(i = 0; i < 2 * MAXCPU; i++) { if (setup_args.page_buffers[i] != 0) free(setup_args.page_buffers[i], M_DEVBUF); } hv_vmbus_cleanup(); - /* remove swi, bus and intr resource */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); - - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); + /* remove swi */ + CPU_FOREACH(i) { + if (hv_vmbus_g_context.msg_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[i]); + if (hv_vmbus_g_context.event_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[i]); + hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; + hv_vmbus_g_context.hv_event_intr_event[i] = NULL; + } - swi_remove(msg_swintr); - swi_remove(event_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); return; } @@ -603,6 +727,6 @@ DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); MODULE_VERSION(vmbus,1); -/* TODO: We want to be earlier than SI_SUB_VFS */ -SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); +/* We want to be started after SMP is initialized */ +SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); Index: stable/10/sys/dev/hyperv/vmbus/hv_vmbus_priv.h =================================================================== --- stable/10/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ stable/10/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -181,49 +181,30 @@ #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) -/* - * Connection identifier type - */ -typedef union { - uint32_t as_uint32_t; - struct { - uint32_t id:24; - uint32_t reserved:8; - } u; - -} __packed hv_vmbus_connection_id; - -/* - * Definition of the hv_vmbus_signal_event hypercall 
input structure - */ -typedef struct { - hv_vmbus_connection_id connection_id; - uint16_t flag_number; - uint16_t rsvd_z; -} __packed hv_vmbus_input_signal_event; - -typedef struct { - uint64_t align8; - hv_vmbus_input_signal_event event; -} __packed hv_vmbus_input_signal_event_buffer; - typedef struct { uint64_t guest_id; void* hypercall_page; hv_bool_uint8_t syn_ic_initialized; + + hv_vmbus_handle syn_ic_msg_page[MAXCPU]; + hv_vmbus_handle syn_ic_event_page[MAXCPU]; /* - * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. - * The input param is immutable in our usage and - * must be dynamic mem (vs stack or global). + * For FreeBSD cpuid to Hyper-V vcpuid mapping. */ - hv_vmbus_input_signal_event_buffer *signal_event_buffer; + uint32_t hv_vcpu_index[MAXCPU]; /* - * 8-bytes aligned of the buffer above + * Each cpu has its own software interrupt handler for channel + * event and msg handling. */ - hv_vmbus_input_signal_event *signal_event_param; - - hv_vmbus_handle syn_ic_msg_page[MAXCPU]; - hv_vmbus_handle syn_ic_event_page[MAXCPU]; + struct intr_event *hv_event_intr_event[MAXCPU]; + struct intr_event *hv_msg_intr_event[MAXCPU]; + void *event_swintr[MAXCPU]; + void *msg_swintr[MAXCPU]; + /* + * Host uses this vector to interrupt guest for vmbus channel + * event and msg. + */ + unsigned int hv_cb_vector; } hv_vmbus_context; /* @@ -368,7 +349,8 @@ TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** - * List of channels + * List of primary channels. Sub channels will be linked + * under their primary channel. 
*/ TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; struct mtx channel_lock; @@ -560,6 +542,8 @@ uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; } hv_vmbus_synic_event_flags; +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX (0x40000002) /* * Define synthetic interrupt controller model specific registers @@ -618,7 +602,8 @@ int hv_ring_buffer_write( hv_vmbus_ring_buffer_info *ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buff_count); + uint32_t sg_buff_count, + boolean_t *need_sig); int hv_ring_buffer_peek( hv_vmbus_ring_buffer_info *ring_info, @@ -638,6 +623,12 @@ hv_vmbus_ring_buffer_info *ring_info, char *prefix); +void hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info *ring_info); + +uint32_t hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info *ring_info); + hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); void hv_vmbus_on_channel_message(void *context); @@ -652,7 +643,7 @@ void *payload, size_t payload_size); -uint16_t hv_vmbus_signal_event(void); +uint16_t hv_vmbus_signal_event(void *con_id); void hv_vmbus_synic_init(void *irq_arg); void hv_vmbus_synic_cleanup(void *arg); int hv_vmbus_query_hypervisor_presence(void); @@ -674,7 +665,7 @@ int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); -int hv_vmbus_set_event(uint32_t child_rel_id); +int hv_vmbus_set_event(hv_vmbus_channel *channel); void hv_vmbus_on_events(void *); @@ -718,7 +709,7 @@ typedef struct { unsigned int vector; - void *page_buffers[2]; + void *page_buffers[2 * MAXCPU]; } hv_setup_args; #endif /* __HYPERV_PRIV_H__ */ Index: stable/10/sys/i386/conf/GENERIC =================================================================== --- stable/10/sys/i386/conf/GENERIC +++ stable/10/sys/i386/conf/GENERIC @@ -356,7 +356,9 @@ device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV 
drivers and enhancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations Index: stable/10/sys/i386/i386/apic_vector.s =================================================================== --- stable/10/sys/i386/i386/apic_vector.s +++ stable/10/sys/i386/i386/apic_vector.s @@ -157,6 +157,25 @@ jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + SET_KERNEL_SREGS + cld + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call hv_vector_handler + add $4, %esp + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown.