Index: sys/conf/files.x86 =================================================================== --- sys/conf/files.x86 +++ sys/conf/files.x86 @@ -113,6 +113,7 @@ dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc +dev/hyperv/hvsock/hv_sock.c optional hyperv dev/hyperv/pcib/vmbus_pcib.c optional hyperv pci dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hn_rndis.c optional hyperv Index: sys/dev/hyperv/hvsock/hv_sock.h =================================================================== --- sys/dev/hyperv/hvsock/hv_sock.h +++ sys/dev/hyperv/hvsock/hv_sock.h @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HVSOCK_H +#define _HVSOCK_H +#include +#include +#include + +#include +#include + +/* + * HyperV Socket Protocols + */ +#define HYPERV_SOCK_PROTO_TRANS 1 /* Transport protocol */ + +#define HVADDR_PORT_ANY -1U +#define HVADDR_PORT_UNKNOWN -1U + +#define HVS_LIST_BOUND 0x01 +#define HVS_LIST_CONNECTED 0x02 +#define HVS_LIST_ALL (HVS_LIST_BOUND | HVS_LIST_CONNECTED) + +struct sockaddr_hvs { + unsigned char sa_len; + sa_family_t sa_family; + unsigned int hvs_port; + unsigned char hvs_zero[sizeof(struct sockaddr) - + sizeof(sa_family_t) - + sizeof(unsigned char) - + sizeof(unsigned int)]; +}; + +struct vmpipe_proto_header { + uint32_t vmpipe_pkt_type; + uint32_t vmpipe_data_size; +} __packed; + +struct hvs_pkt_header { + struct vmbus_chanpkt_hdr chan_pkt_hdr; + struct vmpipe_proto_header vmpipe_pkt_hdr; +} __packed; + +struct hvs_pcb { + struct socket *so; /* Pointer to socket */ + struct sockaddr_hvs local_addr; + struct sockaddr_hvs remote_addr; + + struct hyperv_guid vm_srv_id; + struct hyperv_guid host_srv_id; + + struct vmbus_channel *chan; + /* Current packet header on rx ring */ + struct hvs_pkt_header hvs_pkt; + /* Available data in receive br in current packet */ + uint32_t recv_data_len; + /* offset in the packet */ + uint32_t recv_data_off; + bool rb_init; + /* Link lists for global bound and connected sockets */ + LIST_ENTRY(hvs_pcb) bound_next; + LIST_ENTRY(hvs_pcb) connected_next; +}; + +#define so2hvspcb(so) \ + 
((struct hvs_pcb *)((so)->so_pcb)) +#define hsvpcb2so(hvspcb) \ + ((struct socket *)((hvspcb)->so)) + +void hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *); +void hvs_trans_init(void); +void hvs_trans_close(struct socket *); +void hvs_trans_detach(struct socket *); +void hvs_trans_abort(struct socket *); +int hvs_trans_attach(struct socket *, int, struct thread *); +int hvs_trans_bind(struct socket *, struct sockaddr *, struct thread *); +int hvs_trans_listen(struct socket *, int, struct thread *); +int hvs_trans_accept(struct socket *, struct sockaddr **); +int hvs_trans_connect(struct socket *, + struct sockaddr *, struct thread *); +int hvs_trans_peeraddr(struct socket *, struct sockaddr **); +int hvs_trans_sockaddr(struct socket *, struct sockaddr **); +int hvs_trans_soreceive(struct socket *, struct sockaddr **, + struct uio *, struct mbuf **, struct mbuf **, int *); +int hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *, + struct mbuf *, struct mbuf *, int, struct thread *); +int hvs_trans_disconnect(struct socket *); +int hvs_trans_shutdown(struct socket *); + +int hvs_trans_lock(void); +void hvs_trans_unlock(void); + +void hvs_remove_socket_from_list(struct socket *, unsigned char); +#endif /* _HVSOCK_H */ Index: sys/dev/hyperv/hvsock/hv_sock.c =================================================================== --- sys/dev/hyperv/hvsock/hv_sock.c +++ sys/dev/hyperv/hvsock/hv_sock.c @@ -0,0 +1,1748 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "hv_sock.h" + +#define HVSOCK_DBG_NONE 0x0 +#define HVSOCK_DBG_INFO 0x1 +#define HVSOCK_DBG_ERR 0x2 +#define HVSOCK_DBG_VERBOSE 0x3 + + +SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); + +static int hvs_dbg_level; +SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, + 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); + + +#define HVSOCK_DBG(level, ...) 
do { \ + if (hvs_dbg_level >= (level)) \ + printf(__VA_ARGS__); \ + } while (0) + +MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); + +/* The MTU is 16KB per host side's design */ +#define HVSOCK_MTU_SIZE (1024 * 16) +#define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) + +#define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) + +#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ + roundup2(payload_len, 8) + \ + sizeof(uint64_t)) + + +static struct domain hv_socket_domain; + +/* + * HyperV Transport sockets + */ +static struct pr_usrreqs hvs_trans_usrreqs = { + .pru_attach = hvs_trans_attach, + .pru_bind = hvs_trans_bind, + .pru_listen = hvs_trans_listen, + .pru_accept = hvs_trans_accept, + .pru_connect = hvs_trans_connect, + .pru_peeraddr = hvs_trans_peeraddr, + .pru_sockaddr = hvs_trans_sockaddr, + .pru_soreceive = hvs_trans_soreceive, + .pru_sosend = hvs_trans_sosend, + .pru_disconnect = hvs_trans_disconnect, + .pru_close = hvs_trans_close, + .pru_detach = hvs_trans_detach, + .pru_shutdown = hvs_trans_shutdown, + .pru_abort = hvs_trans_abort, +}; + +/* + * Definitions of protocols supported in HyperV socket domain + */ +static struct protosw hv_socket_protosw[] = { +{ + .pr_type = SOCK_STREAM, + .pr_domain = &hv_socket_domain, + .pr_protocol = HYPERV_SOCK_PROTO_TRANS, + .pr_flags = PR_CONNREQUIRED, + .pr_init = hvs_trans_init, + .pr_usrreqs = &hvs_trans_usrreqs, +}, +}; + +static struct domain hv_socket_domain = { + .dom_family = AF_HYPERV, + .dom_name = "hyperv", + .dom_protosw = hv_socket_protosw, + .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] +}; + +VNET_DOMAIN_SET(hv_socket_); + +#define MAX_PORT ((uint32_t)0xFFFFFFFF) +#define MIN_PORT ((uint32_t)0x0) + +/* 00000000-facb-11e6-bd58-64006a7986d3 */ +static const struct hyperv_guid srv_id_template = { + .hv_guid = { + 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, + 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } +}; + 
+static int hvsock_br_callback(void *, int, void *); +static uint32_t hvsock_canread_check(struct hvs_pcb *); +static uint32_t hvsock_canwrite_check(struct hvs_pcb *); +static int hvsock_send_data(struct vmbus_channel *chan, + struct uio *uio, uint32_t to_write, struct sockbuf *sb); + + + +/* Globals */ +static struct sx hvs_trans_socks_sx; +static struct mtx hvs_trans_socks_mtx; +static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; +static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; +static uint32_t previous_auto_bound_port; + +static void +hvsock_print_guid(struct hyperv_guid *guid) +{ + unsigned char *p = (unsigned char *)guid; + + HVSOCK_DBG(HVSOCK_DBG_INFO, + "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", + *(unsigned int *)p, + *((unsigned short *) &p[4]), + *((unsigned short *) &p[6]), + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); +} + +static bool +is_valid_srv_id(const struct hyperv_guid *id) +{ + return !memcmp(&id->hv_guid[4], + &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); +} + +static unsigned int +get_port_by_srv_id(const struct hyperv_guid *srv_id) +{ + return *((const unsigned int *)srv_id); +} + +static void +set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) +{ + *((unsigned int *)srv_id) = port; +} + + +static void +__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + + if (!pcb) + return; + + if (list & HVS_LIST_BOUND) { + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p == pcb) + LIST_REMOVE(p, bound_next); + } + + if (list & HVS_LIST_CONNECTED) { + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p == pcb) + LIST_REMOVE(pcb, connected_next); + } +} + +static void +__hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + 
+ __hvs_remove_pcb_from_list(pcb, list); +} + +static void +__hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + if (list & HVS_LIST_BOUND) + LIST_INSERT_HEAD(&hvs_trans_bound_socks, + pcb, bound_next); + + if (list & HVS_LIST_CONNECTED) + LIST_INSERT_HEAD(&hvs_trans_connected_socks, + pcb, connected_next); +} + +void +hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static void +hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_insert_socket_on_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static struct socket * +__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + if (list & HVS_LIST_BOUND) + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + if (list & HVS_LIST_CONNECTED) + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + return NULL; +} + +static struct socket * +hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct socket *s = NULL; + + mtx_lock(&hvs_trans_socks_mtx); + s = __hvs_find_socket_on_list(addr, list); + mtx_unlock(&hvs_trans_socks_mtx); + + return s; +} + +static inline void +hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) +{ + memset(addr, 0, sizeof(*addr)); + addr->sa_family = AF_HYPERV; + addr->hvs_port = port; +} + +void +hvs_addr_init(struct sockaddr_hvs 
*addr, const struct hyperv_guid *svr_id) +{ + hvs_addr_set(addr, get_port_by_srv_id(svr_id)); +} + +int +hvs_trans_lock(void) +{ + sx_xlock(&hvs_trans_socks_sx); + return (0); +} + +void +hvs_trans_unlock(void) +{ + sx_xunlock(&hvs_trans_socks_sx); +} + +void +hvs_trans_init(void) +{ + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + if (vm_guest != VM_GUEST_HV) + return; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_init called\n", __func__); + + /* Initialize Globals */ + previous_auto_bound_port = MAX_PORT; + sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); + mtx_init(&hvs_trans_socks_mtx, + "hvs_trans_socks_mtx", NULL, MTX_DEF); + LIST_INIT(&hvs_trans_bound_socks); + LIST_INIT(&hvs_trans_connected_socks); +} + +/* + * Called in two cases: + * 1) When user calls socket(); + * 2) When we accept new incoming conneciton and call sonewconn(). + */ +int +hvs_trans_attach(struct socket *so, int proto, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_attach called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (ESOCKTNOSUPPORT); + + if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) + return (EPROTONOSUPPORT); + + if (pcb != NULL) + return (EISCONN); + pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); + if (pcb == NULL) + return (ENOMEM); + + pcb->so = so; + so->so_pcb = (void *)pcb; + + return (0); +} + +void +hvs_trans_detach(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_detach called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + bzero(pcb, sizeof(*pcb)); + free(pcb, M_HVSOCK); + } + + so->so_pcb = NULL; + + hvs_trans_unlock(); +} + +int +hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread 
*td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_bind called\n", __func__); + + if (sa == NULL) { + return (EINVAL); + } + + if (pcb == NULL) { + return (EINVAL); + } + + if (sa->sa_family != AF_HYPERV) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Not supported, sa_family is %u\n", + __func__, sa->sa_family); + return (EAFNOSUPPORT); + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: binding port = 0x%x\n", __func__, sa->hvs_port); + + mtx_lock(&hvs_trans_socks_mtx); + if (__hvs_find_socket_on_list(sa, + HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { + error = EADDRINUSE; + } else { + /* + * The address is available for us to bind. + * Add socket to the bound list. + */ + hvs_addr_set(&pcb->local_addr, sa->hvs_port); + hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + } + mtx_unlock(&hvs_trans_socks_mtx); + + return (error); +} + +int +hvs_trans_listen(struct socket *so, int backlog, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct socket *bound_so; + int error; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_listen called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + /* Check if the address is already bound and it was by us. 
*/ + bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); + if (bound_so == NULL || bound_so != so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Address not bound or not by us.\n", __func__); + return (EADDRNOTAVAIL); + } + + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0) + solisten_proto(so, backlog); + SOCK_UNLOCK(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket listen error = %d\n", __func__, error); + return (error); +} + +int +hvs_trans_accept(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_accept called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, + M_NOWAIT); + + return ((*nam == NULL) ? ENOMEM : 0); +} + +int +hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; + bool found_auto_bound_port = false; + int i, error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", + __func__, raddr->hvs_port); + + if (pcb == NULL) + return (EINVAL); + + /* Verify the remote address */ + if (raddr == NULL) + return (EINVAL); + if (raddr->sa_family != AF_HYPERV) + return (EAFNOSUPPORT); + + mtx_lock(&hvs_trans_socks_mtx); + if (so->so_state & + (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket connect in progress\n", + __func__); + error = EINPROGRESS; + goto out; + } + + /* + * Find an available port for us to auto bind the local + * address. 
+ */ + hvs_addr_set(&pcb->local_addr, 0); + + for (i = previous_auto_bound_port - 1; + i != previous_auto_bound_port; i --) { + if (i == MIN_PORT) + i = MAX_PORT; + + pcb->local_addr.hvs_port = i; + + if (__hvs_find_socket_on_list(&pcb->local_addr, + HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { + found_auto_bound_port = true; + previous_auto_bound_port = i; + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: found local bound port is %x\n", + __func__, pcb->local_addr.hvs_port); + break; + } + } + + if (found_auto_bound_port == true) { + /* Found available port for auto bound, put on list */ + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + /* Set VM service ID */ + pcb->vm_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); + /* Set host service ID and remote port */ + pcb->host_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); + hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); + + /* Change the socket state to SS_ISCONNECTING */ + soisconnecting(so); + } else { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: No local port available for auto bound\n", + __func__); + error = EADDRINUSE; + } + + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); + hvsock_print_guid(&pcb->vm_srv_id); + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); + hvsock_print_guid(&pcb->host_srv_id); + +out: + mtx_unlock(&hvs_trans_socks_mtx); + + if (found_auto_bound_port == true) + vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); + + return (error); +} + +int +hvs_trans_disconnect(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return (EINVAL); + } + + /* If socket is already disconnected, skip this */ + if ((so->so_state & SS_ISDISCONNECTED) == 0) + soisdisconnecting(so); + + hvs_trans_unlock(); + + return (0); +} + +#define 
SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) +struct hvs_callback_arg { + struct uio *uio; + struct sockbuf *sb; +}; + +int +hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canread, to_read; + int flags, error = 0; + struct hvs_callback_arg cbarg; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + if (flagsp != NULL) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + + if (flags & MSG_PEEK) + return (EOPNOTSUPP); + + /* If no space to copy out anything */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) + return (EINVAL); + + sb = &so->so_rcv; + + orig_resid = uio->uio_resid; + + /* Prevent other readers from entering the socket. */ + error = sblock(sb, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: sblock returned error = %d\n", __func__, error); + return (error); + } + + SOCKBUF_LOCK(sb); + + cbarg.uio = uio; + cbarg.sb = sb; + /* + * If the socket is closing, there might still be some data + * in rx br to read. However we need to make sure + * the channel is still open. 
+ */ + if ((sb->sb_state & SBS_CANTRCVMORE) && + (so->so_state & SS_ISDISCONNECTED)) { + /* Other thread already closed the channel */ + error = EPIPE; + goto out; + } + + while (true) { + while (uio->uio_resid > 0 && + (canread = hvsock_canread_check(pcb)) > 0) { + to_read = MIN(canread, uio->uio_resid); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: to_read = %u, skip = %u\n", __func__, to_read, + (unsigned int)(sizeof(struct hvs_pkt_header) + + pcb->recv_data_off)); + + error = vmbus_chan_recv_peek_call(pcb->chan, to_read, + sizeof(struct hvs_pkt_header) + pcb->recv_data_off, + hvsock_br_callback, (void *)&cbarg); + /* + * It is possible socket is disconnected becasue + * we released lock in hvsock_br_callback. So we + * need to check the state to make sure it is not + * disconnected. + */ + if (error || so->so_state & SS_ISDISCONNECTED) { + break; + } + + pcb->recv_data_len -= to_read; + pcb->recv_data_off += to_read; + } + + if (error) + break; + + /* Abort if socket has reported problems. */ + if (so->so_error) { + if (so->so_error == ESHUTDOWN && + orig_resid > uio->uio_resid) { + /* + * Although we got a FIN, we also received + * some data in this round. Delivery it + * to user. + */ + error = 0; + } else { + if (so->so_error != ESHUTDOWN) + error = so->so_error; + } + + break; + } + + /* Cannot received more. */ + if (sb->sb_state & SBS_CANTRCVMORE) + break; + + /* We are done if buffer has been filled */ + if (uio->uio_resid == 0) + break; + + if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) + break; + + /* Buffer ring is empty and we shall not block */ + if ((so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO))) { + if (orig_resid == uio->uio_resid) { + /* We have not read anything */ + error = EAGAIN; + } + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: non blocked read return, error %d.\n", + __func__, error); + break; + } + + /* + * Wait and block until (more) data comes in. + * Note: Drops the sockbuf lock during wait. 
+ */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, read available is %u\n", + __func__, vmbus_chan_read_available(pcb->chan)); + } + +out: + SOCKBUF_UNLOCK(sb); + + sbunlock(sb); + + /* We recieved a FIN in this call */ + if (so->so_error == ESHUTDOWN) { + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + /* Send has already closed */ + soisdisconnecting(so); + } else { + /* Just close the receive side */ + socantrcvmore(so); + } + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: returning error = %d, so_error = %d\n", + __func__, error, so->so_error); + + return (error); +} + +int +hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canwrite, to_write; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %lu\n", + __func__, uio->uio_resid); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + /* If nothing to send */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) + return (EINVAL); + + sb = &so->so_snd; + + orig_resid = uio->uio_resid; + + /* Prevent other writers from entering the socket. 
*/ + error = sblock(sb, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: sblock returned error = %d\n", __func__, error); + return (error); + } + + SOCKBUF_LOCK(sb); + + if ((sb->sb_state & SBS_CANTSENDMORE) || + so->so_error == ESHUTDOWN) { + error = EPIPE; + goto out; + } + + while (uio->uio_resid > 0) { + canwrite = hvsock_canwrite_check(pcb); + if (canwrite == 0) { + /* We have sent some data */ + if (orig_resid > uio->uio_resid) + break; + /* + * We have not sent any data and it is + * non-blocked io + */ + if (so->so_state & SS_NBIO || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { + error = EWOULDBLOCK; + break; + } else { + /* + * We are here because there is no space on + * send buffer ring. Signal the other side + * to read and free more space. + * Sleep wait until space avaiable to send + * Note: Drops the sockbuf lock during wait. + */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, space avail on " + "tx ring is %u\n", + __func__, + vmbus_chan_write_available(pcb->chan)); + + continue; + } + } + to_write = MIN(canwrite, uio->uio_resid); + to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite is %u, to_write = %u\n", __func__, + canwrite, to_write); + error = hvsock_send_data(pcb->chan, uio, to_write, sb); + + if (error) + break; + } + +out: + SOCKBUF_UNLOCK(sb); + sbunlock(sb); + + return (error); +} + +int +hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); + + return ((*nam == NULL)? 
ENOMEM : 0); +} + +int +hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); + + return ((*nam == NULL)? ENOMEM : 0); +} + +void +hvs_trans_close(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_close called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (!pcb) { + hvs_trans_unlock(); + return; + } + + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hvs_trans_close sending a FIN to host\n", __func__); + (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); + } + + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + + pcb->chan = NULL; + pcb->so = NULL; + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + hvs_trans_unlock(); + + return; +} + +void +hvs_trans_abort(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_abort called\n", __func__); + + (void) hvs_trans_lock(); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + if (so->so_state & SS_ISCONNECTED) { + (void) sodisconnect(so); + } + hvs_trans_unlock(); + + return; +} + +int +hvs_trans_shutdown(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 
+ + if (pcb == NULL) + return (EINVAL); + + /* + * Only get called with the shutdown method is SHUT_WR or + * SHUT_RDWR. + * When the method is SHUT_RD or SHUT_RDWR, the caller + * already set the SBS_CANTRCVMORE on receive side socket + * buffer. + */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { + /* + * SHUT_WR only case. + * Receive side is still open. Just close + * the send side. + */ + socantsendmore(so); + } else { + /* SHUT_RDWR case */ + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + (void) hvsock_send_data(pcb->chan, NULL, 0, sb); + SOCKBUF_UNLOCK(sb); + + soisdisconnecting(so); + } + } + + return (0); +} + +/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is + * (see struct sockaddr_hvs). + * + * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- + * guide/make-integration-service, and the endpoint is with + * the below sockaddr: + * + * struct SOCKADDR_HV + * { + * ADDRESS_FAMILY Family; + * USHORT Reserved; + * GUID VmId; + * GUID ServiceId; + * }; + * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via + * VMBus, because here it's obvious the host and the VM can easily identify + * each other. Though the VmID is useful on the host, especially in the case + * of Windows container, FreeBSD VM doesn't need it at all. + * + * To be compatible with similar infrastructure in Linux VMs, we have + * to limit the available GUID space of SOCKADDR_HV so that we can create + * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
/*
 * Tell whether the channel's rx ring holds enough bytes to be worth
 * reading: at least HVSOCK_PKT_LEN(0), i.e. the size of a packet that
 * carries no payload.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	const uint32_t avail = vmbus_chan_read_available(chan);

	if (avail < HVSOCK_PKT_LEN(0))
		return (false);

	return (true);
}
+ */ + (void) hvs_trans_lock(); + so = hsvpcb2so(pcb); + + if (pcb->chan != NULL && so != NULL) { + /* + * Wake up reader if there are data to read. + */ + SOCKBUF_LOCK(&(so)->so_rcv); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: read available = %u\n", __func__, + vmbus_chan_read_available(pcb->chan)); + + if (hvsock_chan_readable(pcb->chan)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&(so)->so_rcv); + + /* + * Wake up sender if space becomes available to write. + */ + SOCKBUF_LOCK(&(so)->so_snd); + canwrite = hvsock_canwrite_check(pcb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite = %u\n", __func__, canwrite); + + if (canwrite > 0) { + sowwakeup_locked(so); + } else { + SOCKBUF_UNLOCK(&(so)->so_snd); + } + } + + hvs_trans_unlock(); + + return; +} + +/* + * Ring buffer copy callback: moves cplen bytes between the ring buffer + * region at datap and the uio carried in cbarg (a struct + * hvs_callback_arg). The socket buffer lock, when one is supplied, is + * dropped around uiomove() and re-taken afterwards. + * Returns EINVAL on a NULL cbarg or datap, otherwise uiomove()'s result. + */ +static int +hvsock_br_callback(void *datap, int cplen, void *cbarg) +{ + struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; + struct uio *uio; + struct sockbuf *sb; + int error = 0; + + /* + * Validate before dereferencing: arg aliases cbarg, so its members + * must not be read until cbarg is known to be non-NULL. + */ + if (cbarg == NULL || datap == NULL) + return (EINVAL); + + uio = arg->uio; + sb = arg->sb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: called, uio_rw = %s, uio_resid = %lu, cplen = %u, " + "datap = %p\n", + __func__, (uio->uio_rw == UIO_READ) ?
"read from br":"write to br", + uio->uio_resid, cplen, datap); + + if (sb) + SOCKBUF_UNLOCK(sb); + + error = uiomove(datap, cplen, uio); + + if (sb) + SOCKBUF_LOCK(sb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after uiomove, uio_resid = %lu, error = %d\n", + __func__, uio->uio_resid, error); + + return (error); +} + +static int +hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, + uint32_t to_write, struct sockbuf *sb) +{ + struct hvs_pkt_header hvs_pkt; + int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; + uint64_t pad = 0; + struct iovec iov[3]; + struct hvs_callback_arg cbarg; + + if (chan == NULL) + return (ENOTCONN); + + hlen = sizeof(struct vmbus_chanpkt_hdr); + hvs_pkthlen = sizeof(struct hvs_pkt_header); + hvs_pktlen = hvs_pkthlen + to_write; + pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " + "pad_pktlen = %u, data_len = %u\n", + __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); + + hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; + hvs_pkt.chan_pkt_hdr.cph_flags = 0; + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); + hvs_pkt.chan_pkt_hdr.cph_xactid = 0; + + hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; + hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; + + cbarg.uio = uio; + cbarg.sb = sb; + + if (uio && to_write > 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = NULL; + iov[1].iov_len = to_write; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - hvs_pktlen; + + error = vmbus_chan_iov_send(chan, iov, 3, + hvsock_br_callback, &cbarg); + } else { + if (to_write == 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = &pad; + iov[1].iov_len = pad_pktlen - hvs_pktlen; + error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); + } + } + + if (error) { + 
HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: error = %d\n", __func__, error); + } + + return (error); +} + +/* + * Check if we have data on current ring buffer to read + * or not. If not, advance the ring buffer read index to + * next packet. Update the recv_data_len and recv_data_off + * to new value. + * Return the number of bytes that can be read, or 0 when there is + * nothing to read or an error was recorded in so_error. + */ +static uint32_t +hvsock_canread_check(struct hvs_pcb *pcb) +{ + uint32_t advance; + uint32_t tlen, hlen, dlen; + uint32_t bytes_canread = 0; + int error; + + /* Without a pcb there is no socket on which to record an error. */ + if (pcb == NULL) + return (0); + + if (pcb->chan == NULL) { + pcb->so->so_error = EIO; + return (0); + } + + /* Still have data not read yet on current packet */ + if (pcb->recv_data_len > 0) + return (pcb->recv_data_len); + + if (pcb->rb_init) + advance = + VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + else + advance = 0; + + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: bytes_canread on br = %u, advance = %u\n", + __func__, bytes_canread, advance); + + if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { + /* + * Nothing to read. Need to advance the rindex before + * calling sbwait, so host knows to wake us up when data + * is available to read on rb.
+ */ + error = vmbus_chan_recv_idxadv(pcb->chan, advance); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: after calling vmbus_chan_recv_idxadv, " + "got error = %d\n", __func__, error); + return (0); + } else { + pcb->rb_init = false; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: advanced %u bytes, " + " bytes_canread on br now = %u\n", + __func__, advance, bytes_canread); + + if (bytes_canread == 0) + return (0); + else + advance = 0; + } + } + + if (bytes_canread < + advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) + return (0); + + error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, + sizeof(struct hvs_pkt_header), advance); + + /* Don't have anything to read */ + if (error) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after calling vmbus_chan_recv_peek, got error = %d\n", + __func__, error); + return (0); + } + + /* + * We just read in a new packet header. Do some sanity checks. 
+ */ + tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); + dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; + if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || + __predict_false(hlen > tlen) || + __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "invalid tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + pcb->so->so_error = EIO; + return (0); + } + if (pcb->rb_init == false) + pcb->rb_init = true; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + + /* The other side has sent a close FIN */ + if (dlen == 0) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: Received FIN from other side\n", __func__); + /* Inform the caller by setting so_error to ESHUTDOWN */ + pcb->so->so_error = ESHUTDOWN; + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canread on receive ring is %u \n", __func__, dlen); + + pcb->recv_data_len = dlen; + pcb->recv_data_off = 0; + + return (pcb->recv_data_len); +} + +static uint32_t +hvsock_canwrite_check(struct hvs_pcb *pcb) +{ + uint32_t writeable; + uint32_t ret; + + if (pcb == NULL || pcb->chan == NULL) + return (0); + + writeable = vmbus_chan_write_available(pcb->chan); + + /* + * We must always reserve a 0-length-payload packet for the FIN. + */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: writeable is %u, should be greater than %lu\n", + __func__, writeable, HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)); + + if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { + /* + * The Tx ring seems full.
+ */ + return (0); + } + + ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: available size is %u\n", __func__, rounddown2(ret, 8)); + + return (rounddown2(ret, 8)); +} + +static void +hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) +{ + vmbus_chan_set_pending_send_size(chan, + HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); +} + +/* + * Open the vmbus channel on behalf of socket 'so'. Hosts older than + * VMBUS_VERSION_WIN10_V5 get the fixed six-page rings; otherwise the + * ring sizes follow the socket buffer high-water marks, clamped to + * [HVS_RINGBUF_*_SIZE, HVS_RINGBUF_MAX_SIZE] and rounded down to a + * page multiple. Returns the result of vmbus_chan_open(). + */ +static int +hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) +{ + unsigned int rcvbuf, sndbuf; + struct hvs_pcb *pcb = so2hvspcb(so); + int ret; + + if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { + sndbuf = HVS_RINGBUF_SND_SIZE; + rcvbuf = HVS_RINGBUF_RCV_SIZE; + } else { + sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); + sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); + sndbuf = rounddown2(sndbuf, PAGE_SIZE); + rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); + rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); + rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); + } + + /* + * Can only read whatever user provided size of data + * from ring buffer. Turn off batched reading. + */ + vmbus_chan_set_readbatch(chan, false); + + ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, + hvsock_chan_cb, pcb); + + if (ret != 0) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: failed to open hvsock channel, sndbuf = %u, " + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + } else { + HVSOCK_DBG(HVSOCK_DBG_INFO, + "%s: hvsock channel opened, sndbuf = %u, " + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + /* + * Set the pending send size so as to receive wakeup + * signals from host when there is enough space on + * tx buffer ring to write. + */ + hvsock_set_chan_pending_send_size(chan); + } + + return ret; +} + +/* + * Guest is listening passively on the socket. Open channel and + * create a new socket for the connection.
+ */ +static void +hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, + struct hvsock_sc *sc) +{ + struct socket *new_so; + struct hvs_pcb *new_pcb, *pcb; + int error; + + /* Do nothing if socket is not listening */ + if ((so->so_options & SO_ACCEPTCONN) == 0) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket is not a listening one\n", __func__); + return; + } + + /* + * Create a new socket. This will call pru_attach to complete + * the socket initialization and put the new socket onto + * listening socket's sol_incomp list, waiting to be promoted + * to sol_comp list. + * The new socket created has ref count 0. There is no other + * thread that changes the state of this new one at the + * moment, so we don't need to hold its lock while opening + * channel and filling out its pcb information. + */ + new_so = sonewconn(so, 0); + if (!new_so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: creating new socket failed\n", __func__); + /* + * Nothing to attach the channel to; everything below + * dereferences new_so, so bail out here. + */ + return; + } + + /* + * Now open the vmbus channel. If it fails, the socket will be + * on the listening socket's sol_incomp queue until it is + * replaced and aborted. + */ + error = hvsock_open_channel(chan, new_so); + if (error) { + new_so->so_error = error; + return; + } + + pcb = so->so_pcb; + new_pcb = new_so->so_pcb; + + hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); + /* Remote port is unknown to guest in this type of connection */ + hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); + new_pcb->chan = chan; + new_pcb->recv_data_len = 0; + new_pcb->recv_data_off = 0; + new_pcb->rb_init = false; + + new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); + new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); + + hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); + + sc->pcb = new_pcb; + + /* + * Change the socket state to SS_ISCONNECTED. This will promote + * the socket to sol_comp queue and wake up the thread which + * is accepting connection.
+ */ + soisconnected(new_so); +} + + +/* + * Guest is actively connecting to host. + */ +static void +hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) +{ + struct hvs_pcb *pcb; + int error; + + error = hvsock_open_channel(chan, so); + if (error) { + so->so_error = error; + return; + } + + pcb = so->so_pcb; + pcb->chan = chan; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + pcb->rb_init = false; + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Change the socket state to SS_ISCONNECTED. This will wake up + * the thread sleeping in connect call. + */ + soisconnected(so); +} + +static void +hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) +{ + struct hyperv_guid *inst_guid, *type_guid; + bool conn_from_host; + struct sockaddr_hvs addr; + struct socket *so; + struct hvs_pcb *pcb; + + type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); + inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); + conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); + + HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); + hvsock_print_guid(type_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); + hvsock_print_guid(inst_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", + (conn_from_host == true ) ? "from" : "to"); + + /* + * The listening port should be in [0, MAX_LISTEN_PORT] + */ + if (!is_valid_srv_id(type_guid)) + return; + + /* + * There should be a bound socket already created no matter + * it is a passive or active connection. + * For host initiated connection (passive on guest side), + * the type_guid contains the port which guest is bound and + * listening. + * For the guest initiated connection (active on guest side), + * the inst_guid contains the port that guest has auto bound + * to. + */ + hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); + so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); + if (!so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: no bound socket found for port %u\n", + __func__, addr.hvs_port); + return; + } + + if (conn_from_host) { + hvsock_open_conn_passive(chan, so, sc); + } else { + (void) hvs_trans_lock(); + pcb = so->so_pcb; + if (pcb && pcb->so) { + sc->pcb = so2hvspcb(so); + hvsock_open_conn_active(chan, so); + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: channel detached before open\n", __func__); + } + hvs_trans_unlock(); + } + +} + +/* + * Device probe: claim the device only when its vmbus channel is a + * Hyper-V socket channel. Returns BUS_PROBE_DEFAULT on a match and + * ENXIO otherwise. + */ +static int +hvsock_probe(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + + /* + * The debug messages below pass the channel to vmbus_chan_id(), + * so a missing channel has to be rejected before they run. + */ + if (channel == NULL) + return ENXIO; + + if (!vmbus_chan_is_hvs(channel)) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe called but not a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return ENXIO; + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe got a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return BUS_PROBE_DEFAULT; + } +} + +static int +hvsock_attach(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); + + hvsock_open_connection(channel, sc); + + /* + * Always return success. On error the host will rescind the device + * in 30 seconds and we can do cleanup at that time in + * vmbus_chan_msgproc_chrescind().
+ */ + return (0); +} + +static int +hvsock_detach(device_t dev) +{ + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + struct socket *so; + int error, retry; + + if (bootverbose) + device_printf(dev, "hvsock_detach called.\n"); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); + + if (sc->pcb != NULL) { + (void) hvs_trans_lock(); + + so = hsvpcb2so(sc->pcb); + if (so) { + /* Close the connection */ + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_pcb_from_list(sc->pcb, + HVS_LIST_BOUND | HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Close channel while no reader and sender are working + * on the buffer rings. + */ + if (so) { + retry = 0; + while ((error = sblock(&so->so_rcv, 0)) == + EWOULDBLOCK) { + /* + * Someone is reading, rx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for rx reader to exit, " + "retry = %d\n", retry++); + } + retry = 0; + while ((error = sblock(&so->so_snd, 0)) == + EWOULDBLOCK) { + /* + * Someone is sending, tx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for tx sender to exit, " + "retry = %d\n", retry++); + } + } + + + bzero(sc->pcb, sizeof(struct hvs_pcb)); + free(sc->pcb, M_HVSOCK); + sc->pcb = NULL; + + if (so) { + sbunlock(&so->so_rcv); + sbunlock(&so->so_snd); + so->so_pcb = NULL; + } + + hvs_trans_unlock(); + } + + vmbus_chan_close(vmbus_get_channel(dev)); + + return (0); +} + +static device_method_t hvsock_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hvsock_probe), + DEVMETHOD(device_attach, hvsock_attach), + DEVMETHOD(device_detach, hvsock_detach), + DEVMETHOD_END +}; + +static driver_t hvsock_driver = { + "hv_sock", + hvsock_methods, + sizeof(struct hvsock_sc) +}; + +static devclass_t hvsock_devclass; + +DRIVER_MODULE(hvsock, vmbus, hvsock_driver, 
hvsock_devclass, NULL, NULL); +MODULE_VERSION(hvsock, 1); +MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); Index: sys/dev/hyperv/include/vmbus.h =================================================================== --- sys/dev/hyperv/include/vmbus.h +++ sys/dev/hyperv/include/vmbus.h @@ -31,6 +31,7 @@ #include #include +#include /* * VMBUS version is 32 bit, upper 16 bit for major_number and lower @@ -130,6 +131,7 @@ struct taskqueue; typedef void (*vmbus_chan_callback_t)(struct vmbus_channel *, void *); +typedef int (*vmbus_br_copy_callback_t)(void *, int, void *); static __inline struct vmbus_channel * vmbus_get_channel(device_t dev) @@ -205,6 +207,14 @@ int vmbus_chan_recv_pkt(struct vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt, int *pktlen); +int vmbus_chan_recv_idxadv(struct vmbus_channel *chan, + uint32_t advance); +int vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance); +int vmbus_chan_recv_peek_call(struct vmbus_channel *chan, + int data_len, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg); + int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid); int vmbus_chan_send_sglist(struct vmbus_channel *chan, @@ -213,13 +223,30 @@ int vmbus_chan_send_prplist(struct vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid); +int vmbus_chan_iov_send(struct vmbus_channel *chan, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg); +uint32_t vmbus_chan_write_available(struct vmbus_channel *chan); +uint32_t vmbus_chan_read_available(struct vmbus_channel *chan); +bool vmbus_chan_write_signal(struct vmbus_channel *chan, + int32_t min_signal_size); +void vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, + uint32_t size); uint32_t vmbus_chan_id(const struct vmbus_channel *chan); uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan); bool vmbus_chan_is_primary(const struct 
vmbus_channel *chan); bool vmbus_chan_is_revoked(const struct vmbus_channel *chan); -const struct hyperv_guid * - vmbus_chan_guid_inst(const struct vmbus_channel *chan); +bool vmbus_chan_is_hvs(const struct vmbus_channel *chan); +bool vmbus_chan_is_hvs_conn_from_host( + const struct vmbus_channel *chan); +int vmbus_req_tl_connect(struct hyperv_guid *, + struct hyperv_guid *); + +struct hyperv_guid * + vmbus_chan_guid_type(struct vmbus_channel *chan); +struct hyperv_guid * + vmbus_chan_guid_inst(struct vmbus_channel *chan); int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max); bool vmbus_chan_rx_empty(const struct vmbus_channel *chan); Index: sys/dev/hyperv/vmbus/vmbus.c =================================================================== --- sys/dev/hyperv/vmbus/vmbus.c +++ sys/dev/hyperv/vmbus/vmbus.c @@ -365,12 +365,48 @@ uint32_t gpadl; again: - gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); + gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); if (gpadl == 0) goto again; return (gpadl); } +/* Used for Hyper-V socket when guest client connects to host */ +int +vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id, + struct hyperv_guid *host_srv_id) +{ + struct vmbus_softc *sc = vmbus_get_softc(); + struct vmbus_chanmsg_tl_connect *req; + struct vmbus_msghc *mh; + int error; + + if (!sc) + return ENXIO; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + device_printf(sc->vmbus_dev, + "can not get msg hypercall for tl connect\n"); + return ENXIO; + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN; + req->guest_endpoint_id = *guest_srv_id; + req->host_service_id = *host_srv_id; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + device_printf(sc->vmbus_dev, + "tl connect msg hypercall failed\n"); + } + + return error; +} + static int vmbus_connect(struct vmbus_softc *sc, uint32_t version) { Index: sys/dev/hyperv/vmbus/vmbus_br.c 
=================================================================== --- sys/dev/hyperv/vmbus/vmbus_br.c +++ sys/dev/hyperv/vmbus/vmbus_br.c @@ -52,18 +52,23 @@ vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS) { const struct vmbus_br *br = arg1; - uint32_t rindex, windex, imask, ravail, wavail; + uint32_t rindex, windex, imask, psndsz, fvalue, ravail, wavail; + uint64_t intrcnt; char state[256]; + intrcnt = br->vbr_intrcnt; rindex = br->vbr_rindex; windex = br->vbr_windex; imask = br->vbr_imask; + psndsz = br->vbr_psndsz; + fvalue = br->vbr_fvalue; wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize); ravail = br->vbr_dsize - wavail; snprintf(state, sizeof(state), - "rindex:%u windex:%u imask:%u ravail:%u wavail:%u", - rindex, windex, imask, ravail, wavail); + "intrcnt:%lu rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u " + "ravail:%u wavail:%u", + intrcnt, rindex, windex, imask, psndsz, fvalue, ravail, wavail); return sysctl_handle_string(oidp, state, sizeof(state), req); } @@ -76,9 +81,11 @@ #define BR_STATE_RIDX 0 #define BR_STATE_WIDX 1 #define BR_STATE_IMSK 2 -#define BR_STATE_RSPC 3 -#define BR_STATE_WSPC 4 -#define BR_STATE_MAX 5 +#define BR_STATE_PSSZ 3 +#define BR_STATE_FVAL 4 +#define BR_STATE_RSPC 5 +#define BR_STATE_WSPC 6 +#define BR_STATE_MAX 7 const struct vmbus_br *br = arg1; uint32_t rindex, windex, wavail, state[BR_STATE_MAX]; @@ -90,6 +97,8 @@ state[BR_STATE_RIDX] = rindex; state[BR_STATE_WIDX] = windex; state[BR_STATE_IMSK] = br->vbr_imask; + state[BR_STATE_PSSZ] = br->vbr_psndsz; + state[BR_STATE_FVAL] = br->vbr_fvalue; state[BR_STATE_WSPC] = wavail; state[BR_STATE_RSPC] = br->vbr_dsize - wavail; @@ -140,6 +149,12 @@ } uint32_t +vmbus_rxbr_available(const struct vmbus_rxbr *rbr) +{ + return (vmbus_rxbr_avail(rbr)); +} + +uint32_t vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr) { rbr->rxbr_imask = 0; @@ -178,6 +193,40 @@ vmbus_br_setup(&rbr->rxbr, buf, blen); } +static __inline boolean_t +vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t 
bytes_read) +{ + uint32_t pending_snd_sz, canwrite_size; + + /* No need to signal if host doesn't want us to */ + if (!rbr->rxbr_fpsndsz) + return false; + + mb(); + + pending_snd_sz = rbr->rxbr_psndsz; + /* No need to signal if host sets pending_snd_sz to 0 */ + if (!pending_snd_sz) + return false; + + mb(); + + canwrite_size = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr); + + /* No need to signal if br already has enough space before read */ + if (canwrite_size - bytes_read > pending_snd_sz) + return false; + + /* + * No need to signal if still doesn't have enough space + * asked by host + */ + if (canwrite_size <= pending_snd_sz) + return false; + + return true; +} + void vmbus_txbr_init(struct vmbus_txbr *tbr) { @@ -194,8 +243,25 @@ vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen) { vmbus_br_setup(&tbr->txbr, buf, blen); + + /* Set feature bit enabling flow control */ + tbr->txbr_fpsndsz = 1; } +uint32_t +vmbus_txbr_get_imask(const struct vmbus_txbr *tbr) +{ + mb(); + + return(tbr->txbr_imask); +} + +void +vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size) +{ + tbr->txbr_psndsz = size; +} + /* * When we write to the ring buffer, check if the host needs to be * signaled. 
@@ -260,7 +326,117 @@ return VMBUS_BR_IDXINC(windex, cplen, br_dsize); } +static __inline uint32_t +vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex, + uint32_t cplen, vmbus_br_copy_callback_t cb, void *cbarg, int *ret) +{ + uint8_t *br_data = tbr->txbr_data; + uint32_t br_dsize = tbr->txbr_dsize; + int err = 0; + + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + err = cb((void *)(br_data + windex), fraglen, cbarg); + if (!err) + err = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + err = cb((void *)(br_data + windex), cplen, cbarg); + } + + *ret = err; + + return VMBUS_BR_IDXINC(windex, cplen, br_dsize); +} + +uint32_t +vmbus_txbr_available(const struct vmbus_txbr *tbr) +{ + return (vmbus_txbr_avail(tbr)); +} + /* + * NOTE: + * Not holding lock when calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. + */ +int +vmbus_txbr_write_call(struct vmbus_txbr *tbr, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg, + boolean_t *need_sig) +{ + uint32_t old_windex, windex, total; + uint64_t save_windex; + int i; + int cb_ret = 0; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + + /* + * NOTE: + * If this write is going to make br_windex same as br_rindex, + * i.e. the available space for write is same as the write size, + * we can't do it then, since br_windex == br_rindex means that + * the bufring is empty. + */ + if (vmbus_txbr_avail(tbr) <= total) { + return (EAGAIN); + } + + /* Save br_windex for later use */ + old_windex = tbr->txbr_windex; + + /* + * Copy the scattered channel packet to the TX bufring. 
+ */ + windex = old_windex; + for (i = 0; i < iovlen; i++) { + if (iov[i].iov_base != NULL) { + windex = vmbus_txbr_copyto(tbr, windex, + iov[i].iov_base, iov[i].iov_len); + } else if (cb != NULL) { + windex = vmbus_txbr_copyto_call(tbr, windex, + iov[i].iov_len, cb, cbarg, &cb_ret); + /* + * If callback fails, return without updating + * write index. + */ + if (cb_ret) + return (cb_ret); + } + } + + mtx_lock_spin(&tbr->txbr_lock); + + /* + * Set the offset of the current channel packet. + */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* + * Update the write index _after_ the channel packet + * is copied. + */ + __compiler_membar(); + tbr->txbr_windex = windex; + + mtx_unlock_spin(&tbr->txbr_lock); + + if (need_sig) + *need_sig = vmbus_txbr_need_signal(tbr, old_windex); + + return (0); +} + +/* * Write scattered channel packet to TX bufring. * * The offset of this channel packet is written as a 64bits value @@ -346,6 +522,27 @@ return VMBUS_BR_IDXINC(rindex, cplen, br_dsize); } +static __inline uint32_t +vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex, + int cplen, vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint8_t *br_data = rbr->rxbr_data; + uint32_t br_dsize = rbr->rxbr_dsize; + int error = 0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + error = cb((void *)(br_data + rindex), fraglen, cbarg); + if (!error) + error = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + error = cb((void *)(br_data + rindex), cplen, cbarg); + } + return (error); +} + int vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen) { @@ -368,6 +565,121 @@ /* * NOTE: + * We only hold spin lock to check the ring buffer space. It is + * released before calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. 
+ */ +int +vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint32_t rindex, br_dsize0 = rbr->rxbr_dsize; + int ret; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * The requested data + skip and the 64bits channel packet + * offset should be there at least. + */ + if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize0); + mtx_unlock_spin(&rbr->rxbr_lock); + + ret = vmbus_rxbr_copyfrom_call(rbr, rindex, dlen, cb, cbarg); + + return (ret); +} + +/* + * NOTE: + * We assume idx_adv == sizeof(channel packet). + */ +int +vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen, + uint32_t idx_adv, boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough data to read. + */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t) + dlen) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + if (idx_adv > 0) { + /* + * Advance the read index first, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + } + + vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen); + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + if (idx_adv > 0) + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + + sizeof(uint64_t)); + else + *need_sig = false; + } + + return (0); +} + +/* + * NOTE: + * Just update the RX rb index. + */ +int +vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv, + boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough space to advance. 
+ */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + /* + * Advance the read index, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + sizeof(uint64_t)); + } + + return (0); +} + +/* + * NOTE: * We assume (dlen + skip) == sizeof(channel packet). */ int Index: sys/dev/hyperv/vmbus/vmbus_brvar.h =================================================================== --- sys/dev/hyperv/vmbus/vmbus_brvar.h +++ sys/dev/hyperv/vmbus/vmbus_brvar.h @@ -44,6 +44,10 @@ #define vbr_windex vbr->br_windex #define vbr_rindex vbr->br_rindex #define vbr_imask vbr->br_imask +#define vbr_psndsz vbr->br_pending_snd_sz +#define vbr_fpsndsz vbr->br_feature_bits.feat_pending_snd_sz +#define vbr_fvalue vbr->br_feature_bits.value +#define vbr_intrcnt vbr->br_g2h_intr_cnt #define vbr_data vbr->br_data struct vmbus_rxbr { @@ -54,6 +58,10 @@ #define rxbr_windex rxbr.vbr_windex #define rxbr_rindex rxbr.vbr_rindex #define rxbr_imask rxbr.vbr_imask +#define rxbr_psndsz rxbr.vbr_psndsz +#define rxbr_fpsndsz rxbr.vbr_fpsndsz +#define rxbr_fvalue rxbr.vbr_fvalue +#define rxbr_intrcnt rxbr.vbr_intrcnt #define rxbr_data rxbr.vbr_data #define rxbr_dsize rxbr.vbr_dsize @@ -65,6 +73,10 @@ #define txbr_windex txbr.vbr_windex #define txbr_rindex txbr.vbr_rindex #define txbr_imask txbr.vbr_imask +#define txbr_psndsz txbr.vbr_psndsz +#define txbr_fpsndsz txbr.vbr_fpsndsz +#define txbr_fvalue txbr.vbr_fvalue +#define txbr_intrcnt txbr.vbr_intrcnt #define txbr_data txbr.vbr_data #define txbr_dsize txbr.vbr_dsize @@ -118,8 +130,15 @@ int vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen); int vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip); 
+int vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv, + boolean_t *need_sig); +int vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, + int dlen, uint32_t idx_adv, boolean_t *need_sig); +int vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, + uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg); void vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr); uint32_t vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr); +uint32_t vmbus_rxbr_available(const struct vmbus_rxbr *rbr); void vmbus_txbr_init(struct vmbus_txbr *tbr); void vmbus_txbr_deinit(struct vmbus_txbr *tbr); @@ -126,5 +145,13 @@ void vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen); int vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen, boolean_t *need_sig); +int vmbus_txbr_write_call(struct vmbus_txbr *tbr, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg, + boolean_t *need_sig); +uint32_t vmbus_txbr_available(const struct vmbus_txbr *tbr); +uint32_t vmbus_txbr_get_imask(const struct vmbus_txbr *tbr); +void vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, + uint32_t size); #endif /* _VMBUS_BRVAR_H_ */ Index: sys/dev/hyperv/vmbus/vmbus_chan.c =================================================================== --- sys/dev/hyperv/vmbus/vmbus_chan.c +++ sys/dev/hyperv/vmbus/vmbus_chan.c @@ -127,10 +127,11 @@ }; /* - * Notify host that there are data pending on our TX bufring. + * Notify host that there are data pending on our TX bufring or + * we have put some data on the TX bufring. 
*/ static __inline void -vmbus_chan_signal_tx(const struct vmbus_channel *chan) +vmbus_chan_signal(const struct vmbus_channel *chan) { atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask); if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) @@ -139,6 +140,22 @@ hypercall_signal_event(chan->ch_monprm_dma.hv_paddr); } +static __inline void +vmbus_chan_signal_tx(struct vmbus_channel *chan) +{ + chan->ch_txbr.txbr_intrcnt ++; + + vmbus_chan_signal(chan); +} + +static __inline void +vmbus_chan_signal_rx(struct vmbus_channel *chan) +{ + chan->ch_rxbr.rxbr_intrcnt ++; + + vmbus_chan_signal(chan); +} + static void vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) { @@ -1012,7 +1029,60 @@ taskqueue_drain(chan->ch_tq, &chan->ch_task); } +uint32_t +vmbus_chan_write_available(struct vmbus_channel *chan) +{ + return (vmbus_txbr_available(&chan->ch_txbr)); +} + +bool +vmbus_chan_write_signal(struct vmbus_channel *chan, + int32_t min_signal_size) +{ + if (min_signal_size >= 0 && + vmbus_chan_write_available(chan) > min_signal_size) { + return false; + } + + if (!vmbus_txbr_get_imask(&chan->ch_txbr)) { + /* txbr imask is not set, signal the reader */ + vmbus_chan_signal_tx(chan); + return true; + } + + return false; +} + +void +vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, + uint32_t size) +{ + if (chan) + vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size); +} + int +vmbus_chan_iov_send(struct vmbus_channel *chan, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg) +{ + int error; + boolean_t send_evt; + + if (iovlen == 0) + return (0); + + error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen, + cb, cbarg, &send_evt); + + if (!error && send_evt) { + vmbus_chan_signal_tx(chan); + } + + return error; +} + +int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid) { @@ -1211,6 +1281,78 @@ return (0); } +uint32_t +vmbus_chan_read_available(struct 
vmbus_channel *chan) +{ + return (vmbus_rxbr_available(&chan->ch_rxbr)); +} + +/* + * This routine does: + * - Advance the channel read index for 'advance' bytes + * - Copy data_len bytes into the buffer pointed to by 'data' + * Return 0 if the operation succeeds, EAGAIN if the operation fails. + * On failure, the buffer pointed to by 'data' is left intact, and the + * channel read index is not advanced at all. + */ +int +vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (data == NULL || data_len <= 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr, + data, data_len, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + +/* + * This routine does: + * - Advance the channel read index for 'advance' bytes + */ +int +vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (advance == 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + + +/* + * Caller should hold its own lock to serialize the ring buffer + * copy. + */ +int +vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len, + uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg) +{ + if (!chan || data_len <= 0 || cb == NULL) + return (EINVAL); + + return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip, + cb, cbarg)); +} + static void vmbus_chan_task(void *xchan, int pending __unused) { @@ -1732,6 +1874,25 @@ 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN); } + if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) { + /* This is HyperV socket channel */ + chan->ch_is_hvs = true; + /* The first byte != 0 means the host initiated connection.
*/ + chan->ch_hvs_conn_from_host = + offer->chm_udata.pipe.user_def[0]; + + if (bootverbose) { + device_printf(sc->vmbus_dev, + "chan%u is hyperv socket channel " + "connected %s host\n", + chan->ch_id, + (chan->ch_hvs_conn_from_host != 0) ? + "from" : "to"); + } + } else { + chan->ch_is_hvs = false; + } + /* * Setup event flag. */ @@ -2047,9 +2208,32 @@ return false; } -const struct hyperv_guid * -vmbus_chan_guid_inst(const struct vmbus_channel *chan) +bool +vmbus_chan_is_hvs(const struct vmbus_channel *chan) { + return chan->ch_is_hvs; +} + +bool +vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan) +{ + KASSERT(vmbus_chan_is_hvs(chan) == true, + ("Not a HyperV Socket channel %u", chan->ch_id)); + if (chan->ch_hvs_conn_from_host != 0) + return true; + else + return false; +} + +struct hyperv_guid * +vmbus_chan_guid_type(struct vmbus_channel *chan) +{ + return &chan->ch_guid_type; +} + +struct hyperv_guid * +vmbus_chan_guid_inst(struct vmbus_channel *chan) +{ return &chan->ch_guid_inst; } Index: sys/dev/hyperv/vmbus/vmbus_chanvar.h =================================================================== --- sys/dev/hyperv/vmbus/vmbus_chanvar.h +++ sys/dev/hyperv/vmbus/vmbus_chanvar.h @@ -149,6 +149,12 @@ int ch_refs; + /* + * These are for HyperV socket channel only + */ + bool ch_is_hvs; + uint8_t ch_hvs_conn_from_host; + struct sysctl_ctx_list ch_sysctl_ctx; } __aligned(CACHE_LINE_SIZE); Index: sys/dev/hyperv/vmbus/vmbus_reg.h =================================================================== --- sys/dev/hyperv/vmbus/vmbus_reg.h +++ sys/dev/hyperv/vmbus/vmbus_reg.h @@ -127,7 +127,54 @@ */ volatile uint32_t br_imask; - uint8_t br_rsvd[4084]; + /* + * WS2012/Win8 and later versions of Hyper-V implement interrupt + * driven flow management. The feature bit feat_pending_snd_sz + * is set by the host on the host->guest buffer ring, and by the + * guest on the guest->host buffer ring. 
+ * + * The meaning of the feature bit is a bit complex in that it has + * semantics that apply to both buffer rings. If the guest sets + * the feature bit in the guest->host buffer ring, the guest is + * telling the host that: + * 1) It will set the br_pending_snd_sz field in the guest->host buffer + * ring when it is waiting for space to become available, and + * 2) It will read the br_pending_snd_sz field in the host->guest + * ring buffer and interrupt the host when it frees enough space + * + * Similarly, if the host sets the feature bit in the host->guest + * ring buffer, the host is telling the guest that: + * 1) It will set the br_pending_snd_sz field in the host->guest ring + * buffer when it is waiting for space to become available, and + * 2) It will read the br_pending_snd_sz field in the guest->host + * ring buffer and interrupt the guest when it frees enough space + * + * If either the guest or host does not set the feature bit that it + * owns, that guest or host must do polling if it encounters a full + * ring buffer, and not signal the other end with an interrupt. + */ + volatile uint32_t br_pending_snd_sz; + uint32_t br_rsvd1[12]; + union { + struct { + uint32_t feat_pending_snd_sz:1; + }; + uint32_t value; + } br_feature_bits; + + /* Padding to PAGE_SIZE */ + uint8_t br_rsvd2[4020]; + + /* + * Total guest to host interrupt count + * - For rx ring, this counts the guest signaling host when this rx + * ring changes from full to not full. + * + * - For tx ring, this counts the guest signaling host when this tx + * ring changes from empty to non-empty.
+ */ + uint64_t br_g2h_intr_cnt; + uint8_t br_data[]; } __packed; CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE); @@ -196,7 +243,14 @@ #define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */ #define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */ #define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */ -#define VMBUS_CHANMSG_TYPE_MAX 22 +#define VMBUS_CHANMSG_TYPE_17 17 +#define VMBUS_CHANMSG_TYPE_18 18 +#define VMBUS_CHANMSG_TYPE_19 19 +#define VMBUS_CHANMSG_TYPE_20 20 +#define VMBUS_CHANMSG_TYPE_TL_CONN 21 /* REQ */ +#define VMBUS_CHANMSG_TYPE_22 22 +#define VMBUS_CHANMSG_TYPE_TL_RESULT 23 /* RESP */ +#define VMBUS_CHANMSG_TYPE_MAX 24 struct vmbus_chanmsg_hdr { uint32_t chm_type; /* VMBUS_CHANMSG_TYPE_ */ @@ -229,6 +283,15 @@ struct vmbus_chanmsg_hdr chm_hdr; } __packed; +/* VMBUS_CHANMSG_TYPE_TL_CONN */ +/* Hyper-V socket guest connect request */ +struct vmbus_chanmsg_tl_connect { + struct vmbus_chanmsg_hdr chm_hdr; + struct hyperv_guid guest_endpoint_id; + struct hyperv_guid host_service_id; +} __packed; + + /* VMBUS_CHANMSG_TYPE_CHOPEN */ struct vmbus_chanmsg_chopen { struct vmbus_chanmsg_hdr chm_hdr; @@ -310,6 +373,12 @@ uint32_t chm_chanid; } __packed; +/* Size of the user defined data buffer for non-pipe offers */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE 120 + +/* Size of the user defined data buffer for pipe offers. */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE 116 + /* VMBUS_CHANMSG_TYPE_CHOFFER */ struct vmbus_chanmsg_choffer { struct vmbus_chanmsg_hdr chm_hdr; @@ -320,7 +389,26 @@ uint32_t chm_svrctx_sz; uint16_t chm_chflags; uint16_t chm_mmio_sz; /* unit: MB */ - uint8_t chm_udata[120]; + + union { + /* Non-pipes */ + struct { + uint8_t user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE]; + } std; + /* + * Pipes: + * For integrated pipe protocol, which is implemented on + * top of standard user-defined data. Pipe clients have + * VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left for + * their own use.
+ */ + struct { + uint32_t pipe_mode; + uint8_t + user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE]; + } pipe; + } chm_udata; + uint16_t chm_subidx; uint16_t chm_rsvd; uint32_t chm_chanid; @@ -331,6 +419,9 @@ } __packed; CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX); +/* Server Flag */ +#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER 0x2000 + #define VMBUS_CHOFFER_FLAG1_HASMNF 0x01 #endif /* !_VMBUS_REG_H_ */ Index: sys/modules/hyperv/Makefile =================================================================== --- sys/modules/hyperv/Makefile +++ sys/modules/hyperv/Makefile @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR = vmbus netvsc storvsc utilities +SUBDIR = vmbus netvsc storvsc utilities hvsock .include Index: sys/modules/hyperv/hvsock/Makefile =================================================================== --- sys/modules/hyperv/hvsock/Makefile +++ sys/modules/hyperv/hvsock/Makefile @@ -0,0 +1,13 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/dev/hyperv/hvsock + +KMOD= hv_sock +SRCS= hv_sock.c +SRCS+= hv_sock.h + +CFLAGS+= -I${SRCTOP}/sys/dev/hyperv/include \ + -I${SRCTOP}/sys/dev/hyperv/vmbus \ + -I${SRCTOP}/sys/dev/hyperv/hvsock + +.include Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -265,7 +265,8 @@ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ -#define AF_MAX 42 +#define AF_HYPERV 43 /* HyperV sockets */ +#define AF_MAX 43 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ @@ -273,7 +274,6 @@ */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 -#define AF_VENDOR02 43 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49