diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c index b2a84befafa2..4bd98d154cbf 100644 --- a/sys/dev/hyperv/hvsock/hv_sock.c +++ b/sys/dev/hyperv/hvsock/hv_sock.c @@ -1,1760 +1,1760 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_sock.h" #define HVSOCK_DBG_NONE 0x0 #define HVSOCK_DBG_INFO 0x1 #define HVSOCK_DBG_ERR 0x2 #define HVSOCK_DBG_VERBOSE 0x3 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); static int hvs_dbg_level; SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); #define HVSOCK_DBG(level, ...) do { \ if (hvs_dbg_level >= (level)) \ printf(__VA_ARGS__); \ } while (0) MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); static int hvs_dom_probe(void); /* The MTU is 16KB per host side's design */ #define HVSOCK_MTU_SIZE (1024 * 16) #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ roundup2(payload_len, 8) + \ sizeof(uint64_t)) static struct domain hv_socket_domain; /* * HyperV Transport sockets */ static struct pr_usrreqs hvs_trans_usrreqs = { .pru_attach = hvs_trans_attach, .pru_bind = hvs_trans_bind, .pru_listen = hvs_trans_listen, .pru_accept = hvs_trans_accept, .pru_connect = hvs_trans_connect, .pru_peeraddr = hvs_trans_peeraddr, .pru_sockaddr = hvs_trans_sockaddr, .pru_soreceive = hvs_trans_soreceive, .pru_sosend = hvs_trans_sosend, .pru_disconnect = hvs_trans_disconnect, .pru_close = hvs_trans_close, .pru_detach = hvs_trans_detach, .pru_shutdown = hvs_trans_shutdown, .pru_abort = hvs_trans_abort, }; /* * Definitions of protocols supported in HyperV socket domain */ static struct protosw hv_socket_protosw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &hv_socket_domain, .pr_protocol = HYPERV_SOCK_PROTO_TRANS, .pr_flags = PR_CONNREQUIRED, .pr_usrreqs = &hvs_trans_usrreqs, }, }; static struct domain hv_socket_domain = { .dom_family = AF_HYPERV, .dom_name = "hyperv", .dom_probe = hvs_dom_probe, .dom_protosw = hv_socket_protosw, .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] }; -VNET_DOMAIN_SET(hv_socket_); +DOMAIN_SET(hv_socket_); #define MAX_PORT ((uint32_t)0xFFFFFFFF) #define MIN_PORT ((uint32_t)0x0) /* 00000000-facb-11e6-bd58-64006a7986d3 */ static const struct hyperv_guid srv_id_template = { .hv_guid = { 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } }; static int hvsock_br_callback(void *, int, void *); static uint32_t hvsock_canread_check(struct hvs_pcb *); static uint32_t hvsock_canwrite_check(struct hvs_pcb *); static int hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, uint32_t to_write, struct sockbuf *sb); /* Globals */ static struct sx hvs_trans_socks_sx; static struct mtx hvs_trans_socks_mtx; static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; static uint32_t previous_auto_bound_port; static void hvsock_print_guid(struct hyperv_guid *guid) { unsigned char *p = (unsigned char *)guid; HVSOCK_DBG(HVSOCK_DBG_INFO, "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", *(unsigned int *)p, *((unsigned short *) &p[4]), *((unsigned short *) &p[6]), p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); } static bool is_valid_srv_id(const struct hyperv_guid *id) { return !memcmp(&id->hv_guid[4], &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); } static unsigned int get_port_by_srv_id(const struct hyperv_guid *srv_id) { return *((const unsigned int *)srv_id); } static void set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) { *((unsigned int *)srv_id) = port; } static void __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) { struct hvs_pcb *p = NULL; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); if (!pcb) return; if (list & HVS_LIST_BOUND) { LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) if (p == pcb) LIST_REMOVE(p, bound_next); } if (list & HVS_LIST_CONNECTED) { LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) if (p == pcb) LIST_REMOVE(pcb, connected_next); } } static void __hvs_remove_socket_from_list(struct socket *so, unsigned char list) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); __hvs_remove_pcb_from_list(pcb, list); } static void __hvs_insert_socket_on_list(struct socket *so, unsigned char list) { struct hvs_pcb *pcb = so2hvspcb(so); if (list & HVS_LIST_BOUND) LIST_INSERT_HEAD(&hvs_trans_bound_socks, pcb, bound_next); if (list & HVS_LIST_CONNECTED) LIST_INSERT_HEAD(&hvs_trans_connected_socks, pcb, connected_next); } void hvs_remove_socket_from_list(struct socket *so, unsigned char list) { if (!so || !so->so_pcb) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: socket or so_pcb is null\n", __func__); return; } mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_socket_from_list(so, list); mtx_unlock(&hvs_trans_socks_mtx); } static void hvs_insert_socket_on_list(struct socket *so, unsigned char list) { if (!so || !so->so_pcb) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: socket or so_pcb is null\n", __func__); return; } mtx_lock(&hvs_trans_socks_mtx); __hvs_insert_socket_on_list(so, list); mtx_unlock(&hvs_trans_socks_mtx); } static struct socket * __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) { struct hvs_pcb *p = NULL; if (list & HVS_LIST_BOUND) LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) if (p->so != NULL && addr->hvs_port == p->local_addr.hvs_port) return p->so; if (list & HVS_LIST_CONNECTED) LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) if (p->so != NULL && addr->hvs_port == p->local_addr.hvs_port) return p->so; return NULL; } static struct socket * hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) { struct socket *s = NULL; mtx_lock(&hvs_trans_socks_mtx); s = __hvs_find_socket_on_list(addr, list); mtx_unlock(&hvs_trans_socks_mtx); return s; } static inline void hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) { memset(addr, 0, sizeof(*addr)); addr->sa_family = AF_HYPERV; addr->sa_len = sizeof(*addr); addr->hvs_port = port; } void hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) { hvs_addr_set(addr, get_port_by_srv_id(svr_id)); } int hvs_trans_lock(void) { sx_xlock(&hvs_trans_socks_sx); return (0); } void hvs_trans_unlock(void) { sx_xunlock(&hvs_trans_socks_sx); } static int hvs_dom_probe(void) { /* Don't even give us a chance to attach on non-HyperV. */ if (vm_guest != VM_GUEST_HV) return (ENXIO); return (0); } static void hvs_trans_init(void *arg __unused) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_init called\n", __func__); /* Initialize Globals */ previous_auto_bound_port = MAX_PORT; sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); mtx_init(&hvs_trans_socks_mtx, "hvs_trans_socks_mtx", NULL, MTX_DEF); LIST_INIT(&hvs_trans_bound_socks); LIST_INIT(&hvs_trans_connected_socks); } SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, hvs_trans_init, NULL); /* * Called in two cases: * 1) When user calls socket(); * 2) When we accept new incoming conneciton and call sonewconn(). */ int hvs_trans_attach(struct socket *so, int proto, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_attach called\n", __func__); if (so->so_type != SOCK_STREAM) return (ESOCKTNOSUPPORT); if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) return (EPROTONOSUPPORT); if (pcb != NULL) return (EISCONN); pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); if (pcb == NULL) return (ENOMEM); pcb->so = so; so->so_pcb = (void *)pcb; return (0); } void hvs_trans_detach(struct socket *so) { struct hvs_pcb *pcb; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_detach called\n", __func__); (void) hvs_trans_lock(); pcb = so2hvspcb(so); if (pcb == NULL) { hvs_trans_unlock(); return; } if (SOLISTENING(so)) { bzero(pcb, sizeof(*pcb)); free(pcb, M_HVSOCK); } so->so_pcb = NULL; hvs_trans_unlock(); } int hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; int error = 0; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_bind called\n", __func__); if (sa == NULL) { return (EINVAL); } if (pcb == NULL) { return (EINVAL); } if (sa->sa_family != AF_HYPERV) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: Not supported, sa_family is %u\n", __func__, sa->sa_family); return (EAFNOSUPPORT); } if (sa->sa_len != sizeof(*sa)) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: Not supported, sa_len is %u\n", __func__, sa->sa_len); return (EINVAL); } HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: binding port = 0x%x\n", __func__, sa->hvs_port); mtx_lock(&hvs_trans_socks_mtx); if (__hvs_find_socket_on_list(sa, HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { error = EADDRINUSE; } else { /* * The address is available for us to bind. * Add socket to the bound list. */ hvs_addr_set(&pcb->local_addr, sa->hvs_port); hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); } mtx_unlock(&hvs_trans_socks_mtx); return (error); } int hvs_trans_listen(struct socket *so, int backlog, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); struct socket *bound_so; int error; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_listen called\n", __func__); if (pcb == NULL) return (EINVAL); /* Check if the address is already bound and it was by us. */ bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); if (bound_so == NULL || bound_so != so) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: Address not bound or not by us.\n", __func__); return (EADDRNOTAVAIL); } SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) solisten_proto(so, backlog); SOCK_UNLOCK(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket listen error = %d\n", __func__, error); return (error); } int hvs_trans_accept(struct socket *so, struct sockaddr **nam) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_accept called\n", __func__); if (pcb == NULL) return (EINVAL); *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); return ((*nam == NULL) ? ENOMEM : 0); } int hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; bool found_auto_bound_port = false; int i, error = 0; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", __func__, raddr->hvs_port); if (pcb == NULL) return (EINVAL); /* Verify the remote address */ if (raddr == NULL) return (EINVAL); if (raddr->sa_family != AF_HYPERV) return (EAFNOSUPPORT); if (raddr->sa_len != sizeof(*raddr)) return (EINVAL); mtx_lock(&hvs_trans_socks_mtx); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: socket connect in progress\n", __func__); error = EINPROGRESS; goto out; } /* * Find an available port for us to auto bind the local * address. */ hvs_addr_set(&pcb->local_addr, 0); for (i = previous_auto_bound_port - 1; i != previous_auto_bound_port; i --) { if (i == MIN_PORT) i = MAX_PORT; pcb->local_addr.hvs_port = i; if (__hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { found_auto_bound_port = true; previous_auto_bound_port = i; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: found local bound port is %x\n", __func__, pcb->local_addr.hvs_port); break; } } if (found_auto_bound_port == true) { /* Found available port for auto bound, put on list */ __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); /* Set VM service ID */ pcb->vm_srv_id = srv_id_template; set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); /* Set host service ID and remote port */ pcb->host_srv_id = srv_id_template; set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); /* Change the socket state to SS_ISCONNECTING */ soisconnecting(so); } else { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: No local port available for auto bound\n", __func__); error = EADDRINUSE; } HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); hvsock_print_guid(&pcb->vm_srv_id); HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); hvsock_print_guid(&pcb->host_srv_id); out: mtx_unlock(&hvs_trans_socks_mtx); if (found_auto_bound_port == true) vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); return (error); } int hvs_trans_disconnect(struct socket *so) { struct hvs_pcb *pcb; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); (void) hvs_trans_lock(); pcb = so2hvspcb(so); if (pcb == NULL) { hvs_trans_unlock(); return (EINVAL); } /* If socket is already disconnected, skip this */ if ((so->so_state & SS_ISDISCONNECTED) == 0) soisdisconnecting(so); hvs_trans_unlock(); return (0); } struct hvs_callback_arg { struct uio *uio; struct sockbuf *sb; }; int hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockbuf *sb; ssize_t orig_resid; uint32_t canread, to_read; int flags, error = 0; struct hvs_callback_arg cbarg; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); if (so->so_type != SOCK_STREAM) return (EINVAL); if (pcb == NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_PEEK) return (EOPNOTSUPP); /* If no space to copy out anything */ if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) return (EINVAL); orig_resid = uio->uio_resid; /* Prevent other readers from entering the socket. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: soiolock returned error = %d\n", __func__, error); return (error); } sb = &so->so_rcv; SOCKBUF_LOCK(sb); cbarg.uio = uio; cbarg.sb = sb; /* * If the socket is closing, there might still be some data * in rx br to read. However we need to make sure * the channel is still open. */ if ((sb->sb_state & SBS_CANTRCVMORE) && (so->so_state & SS_ISDISCONNECTED)) { /* Other thread already closed the channel */ error = EPIPE; goto out; } while (true) { while (uio->uio_resid > 0 && (canread = hvsock_canread_check(pcb)) > 0) { to_read = MIN(canread, uio->uio_resid); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: to_read = %u, skip = %u\n", __func__, to_read, (unsigned int)(sizeof(struct hvs_pkt_header) + pcb->recv_data_off)); error = vmbus_chan_recv_peek_call(pcb->chan, to_read, sizeof(struct hvs_pkt_header) + pcb->recv_data_off, hvsock_br_callback, (void *)&cbarg); /* * It is possible socket is disconnected becasue * we released lock in hvsock_br_callback. So we * need to check the state to make sure it is not * disconnected. */ if (error || so->so_state & SS_ISDISCONNECTED) { break; } pcb->recv_data_len -= to_read; pcb->recv_data_off += to_read; } if (error) break; /* Abort if socket has reported problems. */ if (so->so_error) { if (so->so_error == ESHUTDOWN && orig_resid > uio->uio_resid) { /* * Although we got a FIN, we also received * some data in this round. Delivery it * to user. */ error = 0; } else { if (so->so_error != ESHUTDOWN) error = so->so_error; } break; } /* Cannot received more. */ if (sb->sb_state & SBS_CANTRCVMORE) break; /* We are done if buffer has been filled */ if (uio->uio_resid == 0) break; if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) break; /* Buffer ring is empty and we shall not block */ if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { if (orig_resid == uio->uio_resid) { /* We have not read anything */ error = EAGAIN; } HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: non blocked read return, error %d.\n", __func__, error); break; } /* * Wait and block until (more) data comes in. * Note: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) break; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: wake up from sbwait, read available is %u\n", __func__, vmbus_chan_read_available(pcb->chan)); } out: SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); /* We recieved a FIN in this call */ if (so->so_error == ESHUTDOWN) { if (so->so_snd.sb_state & SBS_CANTSENDMORE) { /* Send has already closed */ soisdisconnecting(so); } else { /* Just close the receive side */ socantrcvmore(so); } } HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: returning error = %d, so_error = %d\n", __func__, error, so->so_error); return (error); } int hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockbuf *sb; ssize_t orig_resid; uint32_t canwrite, to_write; int error = 0; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", __func__, uio->uio_resid); if (so->so_type != SOCK_STREAM) return (EINVAL); if (pcb == NULL) return (EINVAL); /* If nothing to send */ if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) return (EINVAL); orig_resid = uio->uio_resid; /* Prevent other writers from entering the socket. */ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: soiolocak returned error = %d\n", __func__, error); return (error); } sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) || so->so_error == ESHUTDOWN) { error = EPIPE; goto out; } while (uio->uio_resid > 0) { canwrite = hvsock_canwrite_check(pcb); if (canwrite == 0) { /* We have sent some data */ if (orig_resid > uio->uio_resid) break; /* * We have not sent any data and it is * non-blocked io */ if (so->so_state & SS_NBIO || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { error = EWOULDBLOCK; break; } else { /* * We are here because there is no space on * send buffer ring. Signal the other side * to read and free more space. * Sleep wait until space avaiable to send * Note: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) break; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: wake up from sbwait, space avail on " "tx ring is %u\n", __func__, vmbus_chan_write_available(pcb->chan)); continue; } } to_write = MIN(canwrite, uio->uio_resid); to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: canwrite is %u, to_write = %u\n", __func__, canwrite, to_write); error = hvsock_send_data(pcb->chan, uio, to_write, sb); if (error) break; } out: SOCKBUF_UNLOCK(sb); SOCK_IO_SEND_UNLOCK(so); return (error); } int hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); if (pcb == NULL) return (EINVAL); *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } int hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); if (pcb == NULL) return (EINVAL); *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); return ((*nam == NULL)? ENOMEM : 0); } void hvs_trans_close(struct socket *so) { struct hvs_pcb *pcb; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_close called\n", __func__); (void) hvs_trans_lock(); pcb = so2hvspcb(so); if (!pcb) { hvs_trans_unlock(); return; } if (so->so_state & SS_ISCONNECTED) { /* Send a FIN to peer */ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: hvs_trans_close sending a FIN to host\n", __func__); (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); } if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) soisdisconnected(so); pcb->chan = NULL; pcb->so = NULL; if (SOLISTENING(so)) { mtx_lock(&hvs_trans_socks_mtx); /* Remove from bound list */ __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); mtx_unlock(&hvs_trans_socks_mtx); } hvs_trans_unlock(); return; } void hvs_trans_abort(struct socket *so) { struct hvs_pcb *pcb = so2hvspcb(so); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_abort called\n", __func__); (void) hvs_trans_lock(); if (pcb == NULL) { hvs_trans_unlock(); return; } if (SOLISTENING(so)) { mtx_lock(&hvs_trans_socks_mtx); /* Remove from bound list */ __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); mtx_unlock(&hvs_trans_socks_mtx); } if (so->so_state & SS_ISCONNECTED) { (void) sodisconnect(so); } hvs_trans_unlock(); return; } int hvs_trans_shutdown(struct socket *so) { struct hvs_pcb *pcb = so2hvspcb(so); struct sockbuf *sb; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); if (pcb == NULL) return (EINVAL); /* * Only get called with the shutdown method is SHUT_WR or * SHUT_RDWR. * When the method is SHUT_RD or SHUT_RDWR, the caller * already set the SBS_CANTRCVMORE on receive side socket * buffer. */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { /* * SHUT_WR only case. * Receive side is still open. Just close * the send side. */ socantsendmore(so); } else { /* SHUT_RDWR case */ if (so->so_state & SS_ISCONNECTED) { /* Send a FIN to peer */ sb = &so->so_snd; SOCKBUF_LOCK(sb); (void) hvsock_send_data(pcb->chan, NULL, 0, sb); SOCKBUF_UNLOCK(sb); soisdisconnecting(so); } } return (0); } /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is * (see struct sockaddr_hvs). * * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- * guide/make-integration-service, and the endpoint is with * the below sockaddr: * * struct SOCKADDR_HV * { * ADDRESS_FAMILY Family; * USHORT Reserved; * GUID VmId; * GUID ServiceId; * }; * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via * VMBus, because here it's obvious the host and the VM can easily identify * each other. Though the VmID is useful on the host, especially in the case * of Windows container, FreeBSD VM doesn't need it at all. * * To be compatible with similar infrastructure in Linux VMs, we have * to limit the available GUID space of SOCKADDR_HV so that we can create * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is: * **************************************************************************** * The only valid Service GUIDs, from the perspectives of both the host and * * FreeBSD VM, that can be connected by the other end, must conform to this * * format: -facb-11e6-bd58-64006a7986d3. * **************************************************************************** * * When we write apps on the host to connect(), the GUID ServiceID is used. * When we write apps in FreeBSD VM to connect(), we only need to specify the * port and the driver will form the GUID and use that to request the host. * * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the * auto-generated remote port for a connect request initiated by the host's * connect()) is set to HVADDR_PORT_UNKNOWN, which is not realy used on the * FreeBSD guest. */ /* * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before) * restricts HyperV socket ring buffer size to six 4K pages. Newer * HyperV hosts doen't have this limit. */ #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) struct hvsock_sc { device_t dev; struct hvs_pcb *pcb; struct vmbus_channel *channel; }; static bool hvsock_chan_readable(struct vmbus_channel *chan) { uint32_t readable = vmbus_chan_read_available(chan); return (readable >= HVSOCK_PKT_LEN(0)); } static void hvsock_chan_cb(struct vmbus_channel *chan, void *context) { struct hvs_pcb *pcb = (struct hvs_pcb *) context; struct socket *so; uint32_t canwrite; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: host send us a wakeup on rb data, pcb = %p\n", __func__, pcb); /* * Check if the socket is still attached and valid. * Here we know channel is still open. Need to make * sure the socket has not been closed or freed. */ (void) hvs_trans_lock(); so = hsvpcb2so(pcb); if (pcb->chan != NULL && so != NULL) { /* * Wake up reader if there are data to read. */ SOCKBUF_LOCK(&(so)->so_rcv); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: read available = %u\n", __func__, vmbus_chan_read_available(pcb->chan)); if (hvsock_chan_readable(pcb->chan)) sorwakeup_locked(so); else SOCKBUF_UNLOCK(&(so)->so_rcv); /* * Wake up sender if space becomes available to write. */ SOCKBUF_LOCK(&(so)->so_snd); canwrite = hvsock_canwrite_check(pcb); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: canwrite = %u\n", __func__, canwrite); if (canwrite > 0) { sowwakeup_locked(so); } else { SOCKBUF_UNLOCK(&(so)->so_snd); } } hvs_trans_unlock(); return; } static int hvsock_br_callback(void *datap, int cplen, void *cbarg) { struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; struct uio *uio = arg->uio; struct sockbuf *sb = arg->sb; int error = 0; if (cbarg == NULL || datap == NULL) return (EINVAL); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " "datap = %p\n", __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", uio->uio_resid, cplen, datap); if (sb) SOCKBUF_UNLOCK(sb); error = uiomove(datap, cplen, uio); if (sb) SOCKBUF_LOCK(sb); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: after uiomove, uio_resid = %zd, error = %d\n", __func__, uio->uio_resid, error); return (error); } static int hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, uint32_t to_write, struct sockbuf *sb) { struct hvs_pkt_header hvs_pkt; int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; uint64_t pad = 0; struct iovec iov[3]; struct hvs_callback_arg cbarg; if (chan == NULL) return (ENOTCONN); hlen = sizeof(struct vmbus_chanpkt_hdr); hvs_pkthlen = sizeof(struct hvs_pkt_header); hvs_pktlen = hvs_pkthlen + to_write; pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " "pad_pktlen = %u, data_len = %u\n", __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; hvs_pkt.chan_pkt_hdr.cph_flags = 0; VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); hvs_pkt.chan_pkt_hdr.cph_xactid = 0; hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; cbarg.uio = uio; cbarg.sb = sb; if (uio && to_write > 0) { iov[0].iov_base = &hvs_pkt; iov[0].iov_len = hvs_pkthlen; iov[1].iov_base = NULL; iov[1].iov_len = to_write; iov[2].iov_base = &pad; iov[2].iov_len = pad_pktlen - hvs_pktlen; error = vmbus_chan_iov_send(chan, iov, 3, hvsock_br_callback, &cbarg); } else { if (to_write == 0) { iov[0].iov_base = &hvs_pkt; iov[0].iov_len = hvs_pkthlen; iov[1].iov_base = &pad; iov[1].iov_len = pad_pktlen - hvs_pktlen; error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); } } if (error) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: error = %d\n", __func__, error); } return (error); } /* * Check if we have data on current ring buffer to read * or not. If not, advance the ring buffer read index to * next packet. Update the recev_data_len and recev_data_off * to new value. * Return the number of bytes can read. */ static uint32_t hvsock_canread_check(struct hvs_pcb *pcb) { uint32_t advance; uint32_t tlen, hlen, dlen; uint32_t bytes_canread = 0; int error; if (pcb == NULL || pcb->chan == NULL) { pcb->so->so_error = EIO; return (0); } /* Still have data not read yet on current packet */ if (pcb->recv_data_len > 0) return (pcb->recv_data_len); if (pcb->rb_init) advance = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); else advance = 0; bytes_canread = vmbus_chan_read_available(pcb->chan); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: bytes_canread on br = %u, advance = %u\n", __func__, bytes_canread, advance); if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { /* * Nothing to read. Need to advance the rindex before * calling sbwait, so host knows to wake us up when data * is available to read on rb. */ error = vmbus_chan_recv_idxadv(pcb->chan, advance); if (error) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: after calling vmbus_chan_recv_idxadv, " "got error = %d\n", __func__, error); return (0); } else { pcb->rb_init = false; pcb->recv_data_len = 0; pcb->recv_data_off = 0; bytes_canread = vmbus_chan_read_available(pcb->chan); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: advanced %u bytes, " " bytes_canread on br now = %u\n", __func__, advance, bytes_canread); if (bytes_canread == 0) return (0); else advance = 0; } } if (bytes_canread < advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) return (0); error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, sizeof(struct hvs_pkt_header), advance); /* Don't have anything to read */ if (error) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: after calling vmbus_chan_recv_peek, got error = %d\n", __func__, error); return (0); } /* * We just read in a new packet header. Do some sanity checks. */ tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || __predict_false(hlen > tlen) || __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "invalid tlen(%u), hlen(%u) or dlen(%u)\n", tlen, hlen, dlen); pcb->so->so_error = EIO; return (0); } if (pcb->rb_init == false) pcb->rb_init = true; HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", tlen, hlen, dlen); /* The other side has sent a close FIN */ if (dlen == 0) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: Received FIN from other side\n", __func__); /* inform the caller by seting so_error to ESHUTDOWN */ pcb->so->so_error = ESHUTDOWN; } HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: canread on receive ring is %u \n", __func__, dlen); pcb->recv_data_len = dlen; pcb->recv_data_off = 0; return (pcb->recv_data_len); } static uint32_t hvsock_canwrite_check(struct hvs_pcb *pcb) { uint32_t writeable; uint32_t ret; if (pcb == NULL || pcb->chan == NULL) return (0); writeable = vmbus_chan_write_available(pcb->chan); /* * We must always reserve a 0-length-payload packet for the FIN. */ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: writeable is %u, should be greater than %ju\n", __func__, writeable, (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { /* * The Tx ring seems full. */ return (0); } ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: available size is %u\n", __func__, rounddown2(ret, 8)); return (rounddown2(ret, 8)); } static void hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) { vmbus_chan_set_pending_send_size(chan, HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); } static int hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) { unsigned int rcvbuf, sndbuf; struct hvs_pcb *pcb = so2hvspcb(so); int ret; if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { sndbuf = HVS_RINGBUF_SND_SIZE; rcvbuf = HVS_RINGBUF_RCV_SIZE; } else { sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); sndbuf = rounddown2(sndbuf, PAGE_SIZE); rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); } /* * Can only read whatever user provided size of data * from ring buffer. Turn off batched reading. */ vmbus_chan_set_readbatch(chan, false); ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, hvsock_chan_cb, pcb); if (ret != 0) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: failed to open hvsock channel, sndbuf = %u, " "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); } else { HVSOCK_DBG(HVSOCK_DBG_INFO, "%s: hvsock channel opened, sndbuf = %u, i" "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); /* * Se the pending send size so to receive wakeup * signals from host when there is enough space on * rx buffer ring to write. */ hvsock_set_chan_pending_send_size(chan); } return ret; } /* * Guest is listening passively on the socket. Open channel and * create a new socket for the conneciton. */ static void hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, struct hvsock_sc *sc) { struct socket *new_so; struct hvs_pcb *new_pcb, *pcb; int error; /* Do nothing if socket is not listening */ if (!SOLISTENING(so)) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: socket is not a listening one\n", __func__); return; } /* * Create a new socket. This will call pru_attach to complete * the socket initialization and put the new socket onto * listening socket's sol_incomp list, waiting to be promoted * to sol_comp list. * The new socket created has ref count 0. There is no other * thread that changes the state of this new one at the * moment, so we don't need to hold its lock while opening * channel and filling out its pcb information. */ new_so = sonewconn(so, 0); if (!new_so) HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: creating new socket failed\n", __func__); /* * Now open the vmbus channel. If it fails, the socket will be * on the listening socket's sol_incomp queue until it is * replaced and aborted. */ error = hvsock_open_channel(chan, new_so); if (error) { new_so->so_error = error; return; } pcb = so->so_pcb; new_pcb = new_so->so_pcb; hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); /* Remote port is unknown to guest in this type of conneciton */ hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); new_pcb->chan = chan; new_pcb->recv_data_len = 0; new_pcb->recv_data_off = 0; new_pcb->rb_init = false; new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); sc->pcb = new_pcb; /* * Change the socket state to SS_ISCONNECTED. This will promote * the socket to sol_comp queue and wake up the thread which * is accepting connection. */ soisconnected(new_so); } /* * Guest is actively connecting to host. */ static void hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) { struct hvs_pcb *pcb; int error; error = hvsock_open_channel(chan, so); if (error) { so->so_error = error; return; } pcb = so->so_pcb; pcb->chan = chan; pcb->recv_data_len = 0; pcb->recv_data_off = 0; pcb->rb_init = false; mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); mtx_unlock(&hvs_trans_socks_mtx); /* * Change the socket state to SS_ISCONNECTED. This will wake up * the thread sleeping in connect call. */ soisconnected(so); } static void hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) { struct hyperv_guid *inst_guid, *type_guid; bool conn_from_host; struct sockaddr_hvs addr; struct socket *so; struct hvs_pcb *pcb; type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); hvsock_print_guid(type_guid); HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); hvsock_print_guid(inst_guid); HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", (conn_from_host == true ) ? "from" : "to"); /* * The listening port should be in [0, MAX_LISTEN_PORT] */ if (!is_valid_srv_id(type_guid)) return; /* * There should be a bound socket already created no matter * it is a passive or active connection. * For host initiated connection (passive on guest side), * the type_guid contains the port which guest is bound and * listening. * For the guest initiated connection (active on guest side), * the inst_guid contains the port that guest has auto bound * to. */ hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid); so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); if (!so) { HVSOCK_DBG(HVSOCK_DBG_ERR, "%s: no bound socket found for port %u\n", __func__, addr.hvs_port); return; } if (conn_from_host) { hvsock_open_conn_passive(chan, so, sc); } else { (void) hvs_trans_lock(); pcb = so->so_pcb; if (pcb && pcb->so) { sc->pcb = so2hvspcb(so); hvsock_open_conn_active(chan, so); } else { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: channel detached before open\n", __func__); } hvs_trans_unlock(); } } static int hvsock_probe(device_t dev) { struct vmbus_channel *channel = vmbus_get_channel(dev); if (!channel || !vmbus_chan_is_hvs(channel)) { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_probe called but not a hvsock channel id %u\n", vmbus_chan_id(channel)); return ENXIO; } else { HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_probe got a hvsock channel id %u\n", vmbus_chan_id(channel)); return BUS_PROBE_DEFAULT; } } static int hvsock_attach(device_t dev) { struct vmbus_channel *channel = vmbus_get_channel(dev); struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); hvsock_open_connection(channel, sc); /* * Always return success. On error the host will rescind the device * in 30 seconds and we can do cleanup at that time in * vmbus_chan_msgproc_chrescind(). */ return (0); } static int hvsock_detach(device_t dev) { struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); struct socket *so; int retry; if (bootverbose) device_printf(dev, "hvsock_detach called.\n"); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); if (sc->pcb != NULL) { (void) hvs_trans_lock(); so = hsvpcb2so(sc->pcb); if (so) { /* Close the connection */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) soisdisconnected(so); } mtx_lock(&hvs_trans_socks_mtx); __hvs_remove_pcb_from_list(sc->pcb, HVS_LIST_BOUND | HVS_LIST_CONNECTED); mtx_unlock(&hvs_trans_socks_mtx); /* * Close channel while no reader and sender are working * on the buffer rings. */ if (so) { retry = 0; while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { /* * Someone is reading, rx br is busy */ soisdisconnected(so); DELAY(500); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "waiting for rx reader to exit, " "retry = %d\n", retry++); } retry = 0; while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { /* * Someone is sending, tx br is busy */ soisdisconnected(so); DELAY(500); HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "waiting for tx sender to exit, " "retry = %d\n", retry++); } } bzero(sc->pcb, sizeof(struct hvs_pcb)); free(sc->pcb, M_HVSOCK); sc->pcb = NULL; if (so) { SOCK_IO_RECV_UNLOCK(so); SOCK_IO_SEND_UNLOCK(so); so->so_pcb = NULL; } hvs_trans_unlock(); } vmbus_chan_close(vmbus_get_channel(dev)); return (0); } static device_method_t hvsock_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hvsock_probe), DEVMETHOD(device_attach, hvsock_attach), DEVMETHOD(device_detach, hvsock_detach), DEVMETHOD_END }; static driver_t hvsock_driver = { "hv_sock", hvsock_methods, sizeof(struct hvsock_sc) }; static devclass_t hvsock_devclass; DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); MODULE_VERSION(hvsock, 1); MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 832db9f6e667..dc4a1541319c 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -1,542 +1,514 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization * * Note: domain initialization takes place on a per domain basis * as a result of traversing a SYSINIT linker set. Most likely, * each domain would want to call DOMAIN_SET(9) itself, which * would cause the domain to be added just after domaininit() * is called during startup. * * See DOMAIN_SET(9) for details on its use. */ static void domaininit(void *); SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL); static void domainfinalize(void *); SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL); static struct callout pffast_callout; static struct callout pfslow_callout; static void pffasttimo(void *); static void pfslowtimo(void *); static struct rmlock pftimo_lock; RM_SYSINIT(pftimo_lock, &pftimo_lock, "pftimo"); static LIST_HEAD(, protosw) pffast_list = LIST_HEAD_INITIALIZER(pffast_list); static LIST_HEAD(, protosw) pfslow_list = LIST_HEAD_INITIALIZER(pfslow_list); struct domain *domains; /* registered protocol domains */ int domain_init_status = 0; static struct mtx dom_mtx; /* domain list lock */ MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF); /* * Dummy protocol specific user requests function pointer array. * All functions return EOPNOTSUPP. */ struct pr_usrreqs nousrreqs = { .pru_accept = pru_accept_notsupp, .pru_attach = pru_attach_notsupp, .pru_bind = pru_bind_notsupp, .pru_connect = pru_connect_notsupp, .pru_connect2 = pru_connect2_notsupp, .pru_control = pru_control_notsupp, .pru_disconnect = pru_disconnect_notsupp, .pru_listen = pru_listen_notsupp, .pru_peeraddr = pru_peeraddr_notsupp, .pru_rcvd = pru_rcvd_notsupp, .pru_rcvoob = pru_rcvoob_notsupp, .pru_send = pru_send_notsupp, .pru_sense = pru_sense_null, .pru_shutdown = pru_shutdown_notsupp, .pru_sockaddr = pru_sockaddr_notsupp, .pru_sosend = pru_sosend_notsupp, .pru_soreceive = pru_soreceive_notsupp, .pru_sopoll = pru_sopoll_notsupp, }; static void pr_usrreqs_init(struct protosw *pr) { struct pr_usrreqs *pu; pu = pr->pr_usrreqs; KASSERT(pu != NULL, ("%s: %ssw[%d] has no usrreqs!", __func__, pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); /* * Protocol switch methods fall into three categories: mandatory, * mandatory but protosw_init() provides a default, and optional. * * For true protocols (i.e., pru_attach != NULL), KASSERT truly * mandatory methods with no defaults, and initialize defaults for * other mandatory methods if the protocol hasn't defined an * implementation (NULL function pointer). */ #if 0 if (pu->pru_attach != NULL) { KASSERT(pu->pru_abort != NULL, ("protosw_init: %ssw[%d] pru_abort NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); KASSERT(pu->pru_send != NULL, ("protosw_init: %ssw[%d] pru_send NULL", pr->pr_domain->dom_name, (int)(pr - pr->pr_domain->dom_protosw))); } #endif #define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar) DEFAULT(pu->pru_accept, pru_accept_notsupp); DEFAULT(pu->pru_aio_queue, pru_aio_queue_notsupp); DEFAULT(pu->pru_bind, pru_bind_notsupp); DEFAULT(pu->pru_bindat, pru_bindat_notsupp); DEFAULT(pu->pru_connect, pru_connect_notsupp); DEFAULT(pu->pru_connect2, pru_connect2_notsupp); DEFAULT(pu->pru_connectat, pru_connectat_notsupp); DEFAULT(pu->pru_control, pru_control_notsupp); DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp); DEFAULT(pu->pru_listen, pru_listen_notsupp); DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp); DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp); DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp); DEFAULT(pu->pru_sense, pru_sense_null); DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp); DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp); DEFAULT(pu->pru_sosend, sosend_generic); DEFAULT(pu->pru_soreceive, soreceive_generic); DEFAULT(pu->pru_sopoll, sopoll_generic); DEFAULT(pu->pru_ready, pru_ready_notsupp); #undef DEFAULT } /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. */ void domain_init(void *arg) { struct domain *dp = arg; struct protosw *pr; int flags; + MPASS(IS_DEFAULT_VNET(curvnet)); + flags = atomic_load_acq_int(&dp->dom_flags); if ((flags & DOMF_SUPPORTED) == 0) return; - KASSERT((flags & DOMF_INITED) == 0 || !IS_DEFAULT_VNET(curvnet), - ("Premature initialization of domain in non-default vnet")); + MPASS((flags & DOMF_INITED) == 0); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { - /* - * Note that with VIMAGE enabled, domain_init() will be - * re-invoked for each new vnet that's created. The below lists - * are intended to be system-wide, so avoid altering global - * state for non-default vnets. - */ - if (IS_DEFAULT_VNET(curvnet)) { - pr_usrreqs_init(pr); - rm_wlock(&pftimo_lock); - if (pr->pr_fasttimo != NULL) - LIST_INSERT_HEAD(&pffast_list, pr, - pr_fasttimos); - if (pr->pr_slowtimo != NULL) - LIST_INSERT_HEAD(&pfslow_list, pr, - pr_slowtimos); - rm_wunlock(&pftimo_lock); - } + pr_usrreqs_init(pr); + rm_wlock(&pftimo_lock); + if (pr->pr_fasttimo != NULL) + LIST_INSERT_HEAD(&pffast_list, pr, pr_fasttimos); + if (pr->pr_slowtimo != NULL) + LIST_INSERT_HEAD(&pfslow_list, pr, pr_slowtimos); + rm_wunlock(&pftimo_lock); } /* * update global information about maximums */ max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); - if (IS_DEFAULT_VNET(curvnet)) - atomic_set_rel_int(&dp->dom_flags, DOMF_INITED); -} - -#ifdef VIMAGE -void -vnet_domain_init(void *arg) -{ - - /* Virtualized case is no different -- call init functions. */ - domain_init(arg); + atomic_set_rel_int(&dp->dom_flags, DOMF_INITED); } -void -vnet_domain_uninit(void *arg) -{ - struct domain *dp = arg; - - if ((atomic_load_acq_int(&dp->dom_flags) & DOMF_SUPPORTED) == 0) - return; -} -#endif - /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. */ void domain_add(void *data) { struct domain *dp; dp = (struct domain *)data; if (dp->dom_probe != NULL && (*dp->dom_probe)() != 0) return; atomic_set_rel_int(&dp->dom_flags, DOMF_SUPPORTED); mtx_lock(&dom_mtx); dp->dom_next = domains; domains = dp; KASSERT(domain_init_status >= 1, ("attempt to domain_add(%s) before domaininit()", dp->dom_name)); #ifndef INVARIANTS if (domain_init_status < 1) printf("WARNING: attempt to domain_add(%s) before " "domaininit()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domaininit(void *dummy) { if (max_linkhdr < 16) /* XXX */ max_linkhdr = 16; callout_init(&pffast_callout, 1); callout_init(&pfslow_callout, 1); mtx_lock(&dom_mtx); KASSERT(domain_init_status == 0, ("domaininit called too late!")); domain_init_status = 1; mtx_unlock(&dom_mtx); } /* ARGSUSED*/ static void domainfinalize(void *dummy) { mtx_lock(&dom_mtx); KASSERT(domain_init_status == 1, ("domainfinalize called too late!")); domain_init_status = 2; mtx_unlock(&dom_mtx); callout_reset(&pffast_callout, 1, pffasttimo, NULL); callout_reset(&pfslow_callout, 1, pfslowtimo, NULL); } struct domain * pffinddomain(int family) { struct domain *dp; for (dp = domains; dp != NULL; dp = dp->dom_next) if (dp->dom_family == family) return (dp); return (NULL); } struct protosw * pffindtype(int family, int type) { struct domain *dp; struct protosw *pr; dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_type && pr->pr_type == type) return (pr); return (NULL); } struct protosw * pffindproto(int family, int protocol, int type) { struct domain *dp; struct protosw *pr; struct protosw *maybe; maybe = NULL; if (family == 0) return (NULL); dp = pffinddomain(family); if (dp == NULL) return (NULL); for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) return (pr); if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && pr->pr_protocol == 0 && maybe == NULL) maybe = pr; } return (maybe); } /* * The caller must make sure that the new protocol is fully set up and ready to * accept requests before it is registered. */ int pf_proto_register(int family, struct protosw *npr) { struct domain *dp; struct protosw *pr, *fpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (npr->pr_type == 0) return (EPROTOTYPE); if (npr->pr_protocol == 0) return (EPROTONOSUPPORT); if (npr->pr_usrreqs == NULL) return (ENXIO); /* Try to find the specified domain based on the family. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); /* Initialize backpointer to struct domain. */ npr->pr_domain = dp; fpr = NULL; /* * Protect us against races when two protocol registrations for * the same protocol happen at the same time. */ mtx_lock(&dom_mtx); /* The new protocol must not yet exist. */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == npr->pr_type) && (pr->pr_protocol == npr->pr_protocol)) { mtx_unlock(&dom_mtx); return (EEXIST); /* XXX: Check only protocol? */ } /* While here, remember the first free spacer. */ if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER)) fpr = pr; } /* If no free spacer is found we can't add the new protocol. */ if (fpr == NULL) { mtx_unlock(&dom_mtx); return (ENOMEM); } /* Copy the new struct protosw over the spacer. */ bcopy(npr, fpr, sizeof(*fpr)); pr_usrreqs_init(fpr); rm_wlock(&pftimo_lock); if (fpr->pr_fasttimo != NULL) LIST_INSERT_HEAD(&pffast_list, fpr, pr_fasttimos); if (fpr->pr_slowtimo != NULL) LIST_INSERT_HEAD(&pfslow_list, fpr, pr_slowtimos); rm_wunlock(&pftimo_lock); /* Job is done, no more protection required. */ mtx_unlock(&dom_mtx); return (0); } /* * The caller must make sure the protocol and its functions correctly shut down * all sockets and release all locks and memory references. */ int pf_proto_unregister(int family, int protocol, int type) { struct domain *dp; struct protosw *pr, *dpr; /* Sanity checks. */ if (family == 0) return (EPFNOSUPPORT); if (protocol == 0) return (EPROTONOSUPPORT); if (type == 0) return (EPROTOTYPE); /* Try to find the specified domain based on the family type. */ dp = pffinddomain(family); if (dp == NULL) return (EPFNOSUPPORT); dpr = NULL; /* Lock out everyone else while we are manipulating the protosw. */ mtx_lock(&dom_mtx); /* The protocol must exist and only once. */ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) { if (dpr != NULL) { mtx_unlock(&dom_mtx); return (EMLINK); /* Should not happen! */ } else dpr = pr; } } /* Protocol does not exist. */ if (dpr == NULL) { mtx_unlock(&dom_mtx); return (EPROTONOSUPPORT); } rm_wlock(&pftimo_lock); if (dpr->pr_fasttimo != NULL) LIST_REMOVE(dpr, pr_fasttimos); if (dpr->pr_slowtimo != NULL) LIST_REMOVE(dpr, pr_slowtimos); rm_wunlock(&pftimo_lock); /* De-orbit the protocol and make the slot available again. */ dpr->pr_type = 0; dpr->pr_domain = dp; dpr->pr_protocol = PROTO_SPACER; dpr->pr_flags = 0; dpr->pr_input = NULL; dpr->pr_output = NULL; dpr->pr_ctlinput = NULL; dpr->pr_ctloutput = NULL; dpr->pr_fasttimo = NULL; dpr->pr_slowtimo = NULL; dpr->pr_drain = NULL; dpr->pr_usrreqs = &nousrreqs; /* Job is done, not more protection required. */ mtx_unlock(&dom_mtx); return (0); } void pfctlinput(int cmd, struct sockaddr *sa) { struct domain *dp; struct protosw *pr; NET_EPOCH_ASSERT(); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_ctlinput) (*pr->pr_ctlinput)(cmd, sa, (void *)0); } static void pfslowtimo(void *arg) { struct rm_priotracker tracker; struct epoch_tracker et; struct protosw *pr; rm_rlock(&pftimo_lock, &tracker); NET_EPOCH_ENTER(et); LIST_FOREACH(pr, &pfslow_list, pr_slowtimos) { (*pr->pr_slowtimo)(); } NET_EPOCH_EXIT(et); rm_runlock(&pftimo_lock, &tracker); callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL); } static void pffasttimo(void *arg) { struct rm_priotracker tracker; struct epoch_tracker et; struct protosw *pr; rm_rlock(&pftimo_lock, &tracker); NET_EPOCH_ENTER(et); LIST_FOREACH(pr, &pffast_list, pr_fasttimos) { (*pr->pr_fasttimo)(); } NET_EPOCH_EXIT(et); rm_runlock(&pftimo_lock, &tracker); callout_reset(&pffast_callout, hz/5, pffasttimo, NULL); } diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 7c4ff01e5e73..9ab07b374ea2 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -1,2705 +1,2705 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_route.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; uint32_t _ifm_spare2; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #define SA_SIZE32(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(int) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) ) #endif /* COMPAT_FREEBSD32 */ struct linear_buffer { char *base; /* Base allocated memory pointer */ uint32_t offset; /* Currently used offset */ uint32_t size; /* Total buffer size */ }; #define SCRATCH_BUFFER_SIZE 1024 #define RTS_PID_PRINTF(_fmt, ...) \ printf("rtsock:%s(): PID %d: " _fmt "\n", __func__, curproc->p_pid, ## __VA_ARGS__) MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; VNET_DEFINE_STATIC(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct walkarg { int family; int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; struct sockaddr *dst; struct sockaddr *mask; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb); static int sysctl_dumpentry(struct rtentry *rt, void *vw); static int sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, struct walkarg *w); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, struct rt_msghdr *rtm, struct rib_cmd_info *rc); static int update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh); static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m, sa_family_t saf, u_int fibnum, int rtm_errno); static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host, const struct sockaddr *rt_dst); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void vnet_rts_init(void) { int tmp; if (IS_DEFAULT_VNET(curvnet)) { if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } #ifdef VIMAGE else netisr_register_vnet(&rtsock_nh); #endif } VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_init, 0); #ifdef VIMAGE static void vnet_rts_uninit(void) { netisr_unregister_vnet(&rtsock_nh); } VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred) { #if defined(INET) || defined(INET6) struct epoch_tracker et; #endif /* First, see if the returned address is part of the jail. */ if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } static int fill_blackholeinfo(struct rt_addrinfo *info, union sockaddr_union *saun) { struct ifaddr *ifa; sa_family_t saf; if (V_loif == NULL) { RTS_PID_PRINTF("Unable to add blackhole/reject nhop without loopback"); return (ENOTSUP); } info->rti_ifp = V_loif; saf = info->rti_info[RTAX_DST]->sa_family; CK_STAILQ_FOREACH(ifa, &info->rti_ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == saf) { info->rti_ifa = ifa; break; } } if (info->rti_ifa == NULL) return (ENOTSUP); bzero(saun, sizeof(union sockaddr_union)); switch (saf) { #ifdef INET case AF_INET: saun->sin.sin_family = AF_INET; saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; #endif #ifdef INET6 case AF_INET6: saun->sin6.sin6_family = AF_INET6; saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_addr = in6addr_loopback; break; #endif default: return (ENOTSUP); } info->rti_info[RTAX_GATEWAY] = &saun->sa; info->rti_flags |= RTF_GATEWAY; return (0); } /* * Fills in @info based on userland-provided @rtm message. * * Returns 0 on success. */ static int fill_addrinfo(struct rt_msghdr *rtm, int len, struct linear_buffer *lb, u_int fibnum, struct rt_addrinfo *info) { int error; rtm->rtm_pid = curproc->p_pid; info->rti_addrs = rtm->rtm_addrs; info->rti_mflags = rtm->rtm_inits; info->rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, info)) return (EINVAL); info->rti_flags = rtm->rtm_flags; error = cleanup_xaddrs(info, lb); if (error != 0) return (error); /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error != 0) return (error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info->rti_info[RTAX_GATEWAY] != NULL && info->rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; struct sockaddr_storage ss; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info->rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info->rti_flags &= ~RTF_GATEWAY; info->rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } return (0); } static struct nhop_object * select_nhop(struct nhop_object *nh, const struct sockaddr *gw) { if (!NH_IS_NHGRP(nh)) return (nh); #ifdef ROUTE_MPATH struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); if (gw == NULL) return (wn[0].nh); for (int i = 0; i < num_nhops; i++) { if (match_nhop_gw(wn[i].nh, gw)) return (wn[i].nh); } #endif return (NULL); } /* * Handles RTM_GET message from routing socket, returning matching rt. * * Returns: * 0 on success, with locked and referenced matching rt in @rt_nrt * errno of failure */ static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, struct rt_msghdr *rtm, struct rib_cmd_info *rc) { RIB_RLOCK_TRACKER; struct rib_head *rnh; struct nhop_object *nh; sa_family_t saf; saf = info->rti_info[RTAX_DST]->sa_family; rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); /* * By (implicit) convention host route (one without netmask) * means longest-prefix-match request and the route with netmask * means exact-match lookup. * As cleanup_xaddrs() cleans up info flags&addrs for the /32,/128 * prefixes, use original data to check for the netmask presence. */ if ((rtm->rtm_addrs & RTA_NETMASK) == 0) { /* * Provide longest prefix match for * address lookup (no mask). * 'route -n get addr' */ rc->rc_rt = (struct rtentry *) rnh->rnh_matchaddr( info->rti_info[RTAX_DST], &rnh->head); } else rc->rc_rt = (struct rtentry *) rnh->rnh_lookup( info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rc->rc_rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]); if (nh == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. * TODO: move this logic to userland. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (nh->nh_ifp != NULL && nh->nh_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; ifa = ifa_ifwithnet(info->rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); } else rt_maskedcopy(nh->nh_ifa->ifa_addr, &laddr, nh->nh_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rc->rc_rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rc->rc_rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]); if (nh == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } } rc->rc_nh_new = nh; rc->rc_nh_weight = rc->rc_rt->rt_weight; RIB_RUNLOCK(rnh); return (0); } static void init_sockaddrs_family(int family, struct sockaddr *dst, struct sockaddr *mask) { #ifdef INET if (family == AF_INET) { struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *mask4 = (struct sockaddr_in *)mask; bzero(dst4, sizeof(struct sockaddr_in)); bzero(mask4, sizeof(struct sockaddr_in)); dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); mask4->sin_family = AF_INET; mask4->sin_len = sizeof(struct sockaddr_in); } #endif #ifdef INET6 if (family == AF_INET6) { struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask; bzero(dst6, sizeof(struct sockaddr_in6)); bzero(mask6, sizeof(struct sockaddr_in6)); dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(struct sockaddr_in6); mask6->sin6_family = AF_INET6; mask6->sin6_len = sizeof(struct sockaddr_in6); } #endif } static void export_rtaddrs(const struct rtentry *rt, struct sockaddr *dst, struct sockaddr *mask) { #ifdef INET if (dst->sa_family == AF_INET) { struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *mask4 = (struct sockaddr_in *)mask; uint32_t scopeid = 0; rt_get_inet_prefix_pmask(rt, &dst4->sin_addr, &mask4->sin_addr, &scopeid); return; } #endif #ifdef INET6 if (dst->sa_family == AF_INET6) { struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask; uint32_t scopeid = 0; rt_get_inet6_prefix_pmask(rt, &dst6->sin6_addr, &mask6->sin6_addr, &scopeid); dst6->sin6_scope_id = scopeid; return; } #endif } static int update_rtm_from_info(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len) { struct rt_msghdr *rtm, *orig_rtm = NULL; struct walkarg w; int len; rtm = *prtm; /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *tmp_rtm; tmp_rtm = malloc(len, M_TEMP, M_NOWAIT); if (tmp_rtm == NULL) return (ENOBUFS); bcopy(rtm, tmp_rtm, rtm->rtm_msglen); orig_rtm = rtm; rtm = tmp_rtm; alloc_len = len; /* * Delay freeing original rtm as info contains * data referencing it. */ } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, info, &w, &len); rtm->rtm_addrs = info->rti_addrs; if (orig_rtm != NULL) free(orig_rtm, M_TEMP); *prtm = rtm; return (0); } /* * Update sockaddrs, flags, etc in @prtm based on @rc data. * rtm can be reallocated. * * Returns 0 on success, along with pointer to (potentially reallocated) * rtm. * */ static int update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh) { union sockaddr_union saun; struct rt_msghdr *rtm; struct ifnet *ifp; int error; rtm = *prtm; union sockaddr_union sa_dst, sa_mask; int family = info->rti_info[RTAX_DST]->sa_family; init_sockaddrs_family(family, &sa_dst.sa, &sa_mask.sa); export_rtaddrs(rc->rc_rt, &sa_dst.sa, &sa_mask.sa); info->rti_info[RTAX_DST] = &sa_dst.sa; info->rti_info[RTAX_NETMASK] = rt_is_host(rc->rc_rt) ? NULL : &sa_mask.sa; info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; info->rti_info[RTAX_GENMASK] = 0; ifp = nh->nh_ifp; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { if (ifp) { info->rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(info, ifp, nh, &saun, curthread->td_ucred); if (error != 0) return (error); if (ifp->if_flags & IFF_POINTOPOINT) info->rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info->rti_info[RTAX_IFP] = NULL; info->rti_info[RTAX_IFA] = NULL; } } else if (ifp != NULL) rtm->rtm_index = ifp->if_index; if ((error = update_rtm_from_info(info, prtm, alloc_len)) != 0) return (error); rtm = *prtm; rtm->rtm_flags = rc->rc_rt->rte_flags | nhop_get_rtflags(nh); if (rtm->rtm_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rtm->rtm_flags & ~RTF_GWFLAG_COMPAT); rt_getmetrics(rc->rc_rt, nh, &rtm->rtm_rmx); rtm->rtm_rmx.rmx_weight = rc->rc_nh_weight; return (0); } #ifdef ROUTE_MPATH static void save_del_notification(struct rib_cmd_info *rc, void *_cbdata) { struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; if (rc->rc_cmd == RTM_DELETE) *rc_new = *rc; } static void save_add_notification(struct rib_cmd_info *rc, void *_cbdata) { struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; if (rc->rc_cmd == RTM_ADD) *rc_new = *rc; } #endif #if defined(INET6) || defined(INET) static struct sockaddr * alloc_sockaddr_aligned(struct linear_buffer *lb, int len) { len = roundup2(len, sizeof(uint64_t)); if (lb->offset + len > lb->size) return (NULL); struct sockaddr *sa = (struct sockaddr *)(lb->base + lb->offset); lb->offset += len; return (sa); } #endif /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rt_addrinfo info; struct epoch_tracker et; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; sa_family_t saf = AF_UNSPEC; struct rib_cmd_info rc; struct nhop_object *nh; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); NET_EPOCH_ENTER(et); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); int total_len = alloc_len + SCRATCH_BUFFER_SIZE; if ((rtm = malloc(total_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); nh = NULL; struct linear_buffer lb = { .base = (char *)rtm + alloc_len, .size = SCRATCH_BUFFER_SIZE, }; if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ if ((error = fill_addrinfo(rtm, len, &lb, fibnum, &info)) != 0) { senderr(error); } /* fill_addringo() embeds scope into IPv6 addresses */ #ifdef INET6 rti_need_deembed = 1; #endif saf = info.rti_info[RTAX_DST]->sa_family; /* support for new ARP code */ if (rtm->rtm_flags & RTF_LLDATA) { error = lla_rt_output(rtm, &info); goto flush; } union sockaddr_union gw_saun; int blackhole_flags = rtm->rtm_flags & (RTF_BLACKHOLE|RTF_REJECT); if (blackhole_flags != 0) { if (blackhole_flags != (RTF_BLACKHOLE | RTF_REJECT)) error = fill_blackholeinfo(&info, &gw_saun); else error = EINVAL; if (error != 0) senderr(error); } switch (rtm->rtm_type) { case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); } error = rib_action(fibnum, rtm->rtm_type, &info, &rc); if (error == 0) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_new) || (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { struct rib_cmd_info rc_simple = {}; rib_decompose_notification(&rc, save_add_notification, (void *)&rc_simple); rc = rc_simple; } #endif nh = rc.rc_nh_new; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_flags = rc.rc_rt->rte_flags | nhop_get_rtflags(nh); } break; case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_old) || (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { struct rib_cmd_info rc_simple = {}; rib_decompose_notification(&rc, save_del_notification, (void *)&rc_simple); rc = rc_simple; } #endif nh = rc.rc_nh_old; } break; case RTM_GET: error = handle_rtm_get(&info, fibnum, rtm, &rc); if (error != 0) senderr(error); nh = rc.rc_nh_new; if (!can_export_rte(curthread->td_ucred, info.rti_info[RTAX_NETMASK] == NULL, info.rti_info[RTAX_DST])) { senderr(ESRCH); } break; default: senderr(EOPNOTSUPP); } if (error == 0) { error = update_rtm_from_rc(&info, &rtm, alloc_len, &rc, nh); /* * Note that some sockaddr pointers may have changed to * point to memory outsize @rtm. Some may be pointing * to the on-stack variables. * Given that, any pointer in @info CANNOT BE USED. */ /* * scopeid deembedding has been performed while * writing updated rtm in rtsock_msg_buffer(). * With that in mind, skip deembedding procedure below. */ #ifdef INET6 rti_need_deembed = 0; #endif } flush: NET_EPOCH_EXIT(et); #ifdef INET6 if (rtm != NULL) { if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } if (update_rtm_from_info(&info, &rtm, alloc_len) != 0) { if (error != 0) error = ENOBUFS; } } } #endif send_rtm_reply(so, rtm, m, saf, fibnum, error); return (error); } /* * Sends the prepared reply message in @rtm to all rtsock clients. * Frees @m and @rtm. * */ static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m, sa_family_t saf, u_int fibnum, int rtm_errno) { struct rawcb *rp = NULL; /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return; } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { if (rtm_errno!= 0) rtm->rtm_errno = rtm_errno; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } } static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = nh->nh_mtu; out->rmx_weight = rt->rt_weight; out->rmx_nhidx = nhop_get_idx(nh); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } #ifdef INET static inline void fill_sockaddr_inet(struct sockaddr_in *sin, struct in_addr addr) { const struct sockaddr_in nsin = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr = addr, }; *sin = nsin; } #endif #ifdef INET6 static inline void fill_sockaddr_inet6(struct sockaddr_in6 *sin6, const struct in6_addr *addr6, uint32_t scopeid) { const struct sockaddr_in6 nsin6 = { .sin6_family = AF_INET6, .sin6_len = sizeof(struct sockaddr_in6), .sin6_addr = *addr6, .sin6_scope_id = scopeid, }; *sin6 = nsin6; } #endif #if defined(INET6) || defined(INET) /* * Checks if gateway is suitable for lltable operations. * Lltable code requires AF_LINK gateway with ifindex * and mac address specified. * Returns 0 on success. */ static int cleanup_xaddrs_lladdr(struct rt_addrinfo *info) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY]; if (sdl->sdl_family != AF_LINK) return (EINVAL); if (sdl->sdl_index == 0) return (EINVAL); if (offsetof(struct sockaddr_dl, sdl_data) + sdl->sdl_nlen + sdl->sdl_alen > sdl->sdl_len) return (EINVAL); return (0); } static int cleanup_xaddrs_gateway(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; struct sockaddr *sa; if (info->rti_flags & RTF_LLDATA) return (cleanup_xaddrs_lladdr(info)); switch (gw->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *gw_sin = (struct sockaddr_in *)gw; /* Ensure reads do not go beyoud SA boundary */ if (SA_SIZE(gw) < offsetof(struct sockaddr_in, sin_zero)) { RTS_PID_PRINTF("gateway sin_len too small: %d", gw->sa_len); return (EINVAL); } sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_in)); if (sa == NULL) return (ENOBUFS); fill_sockaddr_inet((struct sockaddr_in *)sa, gw_sin->sin_addr); info->rti_info[RTAX_GATEWAY] = sa; } break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *gw_sin6 = (struct sockaddr_in6 *)gw; if (gw_sin6->sin6_len < sizeof(struct sockaddr_in6)) { RTS_PID_PRINTF("gateway sin6_len too small: %d", gw->sa_len); return (EINVAL); } fill_sockaddr_inet6(gw_sin6, &gw_sin6->sin6_addr, 0); break; } #endif case AF_LINK: { struct sockaddr_dl *gw_sdl; size_t sdl_min_len = offsetof(struct sockaddr_dl, sdl_data); gw_sdl = (struct sockaddr_dl *)gw; if (gw_sdl->sdl_len < sdl_min_len) { RTS_PID_PRINTF("gateway sdl_len too small: %d", gw_sdl->sdl_len); return (EINVAL); } sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_dl_short)); if (sa == NULL) return (ENOBUFS); const struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_index = gw_sdl->sdl_index, }; *((struct sockaddr_dl_short *)sa) = sdl; info->rti_info[RTAX_GATEWAY] = sa; break; } } return (0); } #endif static void remove_netmask(struct rt_addrinfo *info) { info->rti_info[RTAX_NETMASK] = NULL; info->rti_flags |= RTF_HOST; info->rti_addrs &= ~RTA_NETMASK; } #ifdef INET static int cleanup_xaddrs_inet(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr_in *dst_sa, *mask_sa; const int sa_len = sizeof(struct sockaddr_in); struct in_addr dst, mask; /* Check & fixup dst/netmask combination first */ dst_sa = (struct sockaddr_in *)info->rti_info[RTAX_DST]; mask_sa = (struct sockaddr_in *)info->rti_info[RTAX_NETMASK]; /* Ensure reads do not go beyound the buffer size */ if (SA_SIZE(dst_sa) < offsetof(struct sockaddr_in, sin_zero)) return (EINVAL); if ((mask_sa != NULL) && mask_sa->sin_len < sizeof(struct sockaddr_in)) { /* * Some older routing software encode mask length into the * sin_len, thus resulting in "truncated" sockaddr. */ int len = mask_sa->sin_len - offsetof(struct sockaddr_in, sin_addr); if (len >= 0) { mask.s_addr = 0; if (len > sizeof(struct in_addr)) len = sizeof(struct in_addr); memcpy(&mask, &mask_sa->sin_addr, len); } else { RTS_PID_PRINTF("prefix mask sin_len too small: %d", mask_sa->sin_len); return (EINVAL); } } else mask.s_addr = mask_sa ? mask_sa->sin_addr.s_addr : INADDR_BROADCAST; dst.s_addr = htonl(ntohl(dst_sa->sin_addr.s_addr) & ntohl(mask.s_addr)); /* Construct new "clean" dst/mask sockaddresses */ if ((dst_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet(dst_sa, dst); info->rti_info[RTAX_DST] = (struct sockaddr *)dst_sa; if (mask.s_addr != INADDR_BROADCAST) { if ((mask_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet(mask_sa, mask); info->rti_info[RTAX_NETMASK] = (struct sockaddr *)mask_sa; info->rti_flags &= ~RTF_HOST; } else remove_netmask(info); /* Check gateway */ if (info->rti_info[RTAX_GATEWAY] != NULL) return (cleanup_xaddrs_gateway(info, lb)); return (0); } #endif #ifdef INET6 static int cleanup_xaddrs_inet6(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr *sa; struct sockaddr_in6 *dst_sa, *mask_sa; struct in6_addr mask, *dst; const int sa_len = sizeof(struct sockaddr_in6); /* Check & fixup dst/netmask combination first */ dst_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_DST]; mask_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_NETMASK]; if (dst_sa->sin6_len < sizeof(struct sockaddr_in6)) { RTS_PID_PRINTF("prefix dst sin6_len too small: %d", dst_sa->sin6_len); return (EINVAL); } if (mask_sa && mask_sa->sin6_len < sizeof(struct sockaddr_in6)) { /* * Some older routing software encode mask length into the * sin6_len, thus resulting in "truncated" sockaddr. */ int len = mask_sa->sin6_len - offsetof(struct sockaddr_in6, sin6_addr); if (len >= 0) { bzero(&mask, sizeof(mask)); if (len > sizeof(struct in6_addr)) len = sizeof(struct in6_addr); memcpy(&mask, &mask_sa->sin6_addr, len); } else { RTS_PID_PRINTF("rtsock: prefix mask sin6_len too small: %d", mask_sa->sin6_len); return (EINVAL); } } else mask = mask_sa ? mask_sa->sin6_addr : in6mask128; dst = &dst_sa->sin6_addr; IN6_MASK_ADDR(dst, &mask); if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet6((struct sockaddr_in6 *)sa, dst, 0); info->rti_info[RTAX_DST] = sa; if (!IN6_ARE_ADDR_EQUAL(&mask, &in6mask128)) { if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet6((struct sockaddr_in6 *)sa, &mask, 0); info->rti_info[RTAX_NETMASK] = sa; info->rti_flags &= ~RTF_HOST; } else remove_netmask(info); /* Check gateway */ if (info->rti_info[RTAX_GATEWAY] != NULL) return (cleanup_xaddrs_gateway(info, lb)); return (0); } #endif static int cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb) { int error = EAFNOSUPPORT; if (info->rti_info[RTAX_DST] == NULL) return (EINVAL); if (info->rti_flags & RTF_LLDATA) { /* * arp(8)/ndp(8) sends RTA_NETMASK for the associated * prefix along with the actual address in RTA_DST. * Remove netmask to avoid unnecessary address masking. */ remove_netmask(info); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: error = cleanup_xaddrs_inet(info, lb); break; #endif #ifdef INET6 case AF_INET6: error = cleanup_xaddrs_inet6(info, lb); break; #endif } return (error); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ struct sockaddr * rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct sockaddr_storage ss; struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); KASSERT(dlen <= sizeof(ss), ("%s: sockaddr size overflow", __func__)); bzero(&ss, sizeof(ss)); bcopy(sa, &ss, sa->sa_len); sa = (struct sockaddr *)&ss; #ifdef INET6 if (sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)sa; (void)sa6_recoverscope(sin6); } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { struct sockaddr_storage ss; int len, buflen = 0, dlen, i; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_in6 *sin6; #endif #ifdef COMPAT_FREEBSD32 bool compat32 = false; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { len = sizeof(struct ifa_msghdrl32); compat32 = true; } else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); compat32 = true; break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); #ifdef COMPAT_FREEBSD32 if (compat32) dlen = SA_SIZE32(sa); else #endif dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { KASSERT(dlen <= sizeof(ss), ("%s: sockaddr size overflow", __func__)); bzero(&ss, sizeof(ss)); bcopy(sa, &ss, sa->sa_len); sa = (struct sockaddr *)&ss; #ifdef INET6 if (sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)sa; (void)sa6_recoverscope(sin6); } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else { bzero(cp, dlen); cp += dlen; buflen -= dlen; } } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal to rtsock based on @rt data. * Callers are advives to use rt_routemsg() instead of using this * function directly. * Assume @rt data is consistent. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { union sockaddr_union dst, mask; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return (0); int family = rt_get_family(rt); init_sockaddrs_family(family, &dst.sa, &mask.sa); export_rtaddrs(rt, &dst.sa, &mask.sa); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = &dst.sa; info.rti_info[RTAX_NETMASK] = &mask.sa; info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; info.rti_flags = rt->rte_flags | nhop_get_rtflags(nh); info.rti_ifp = nh->nh_ifp; return (rtsock_routemsg_info(cmd, &info, fibnum)); } int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { struct rt_msghdr *rtm; struct sockaddr *sa; struct mbuf *m; if (V_route_cb.any_count == 0) return (0); if (info->rti_flags & RTF_HOST) info->rti_info[RTAX_NETMASK] = NULL; m = rtsock_msg_mbuf(cmd, info); if (m == NULL) return (ENOBUFS); if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_addrs = info->rti_addrs; if (info->rti_ifp != NULL) rtm->rtm_index = info->rti_ifp->if_index; /* Add RTF_DONE to indicate command 'completion' required by API */ info->rti_flags |= RTF_DONE; /* Reported routes has to be up */ if (cmd == RTM_ADD || cmd == RTM_CHANGE) info->rti_flags |= RTF_UP; rtm->rtm_flags = info->rti_flags; sa = info->rti_info[RTAX_DST]; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; if (ifp && ifp->if_addr) info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; else info.rti_info[RTAX_IFP] = NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * Checks if rte can be exported w.r.t jails/vnets. * * Returns true if it can, false otherwise. */ static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host, const struct sockaddr *rt_dst) { if ((!rt_is_host) ? jailed_without_vnet(td_ucred) : prison_if(td_ucred, rt_dst) != 0) return (false); return (true); } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct rtentry *rt, void *vw) { struct walkarg *w = vw; struct nhop_object *nh; int error = 0; NET_EPOCH_ASSERT(); export_rtaddrs(rt, w->dst, w->mask); if (!can_export_rte(w->w_req->td->td_ucred, rt_is_host(rt), w->dst)) return (0); nh = rt_get_raw_nhop(rt); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); for (int i = 0; i < num_nhops; i++) { error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w); if (error != 0) return (error); } } else #endif error = sysctl_dumpnhop(rt, nh, rt->rt_weight, w); return (0); } static int sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, struct walkarg *w) { struct rt_addrinfo info; int error = 0, size; uint32_t rtflags; rtflags = nhop_get_rtflags(nh); if (w->w_op == NET_RT_FLAGS && !(rtflags & w->w_arg)) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = w->dst; info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; info.rti_info[RTAX_NETMASK] = (rtflags & RTF_HOST) ? NULL : w->mask; info.rti_info[RTAX_GENMASK] = 0; if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) { info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; if (nh->nh_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; bzero(&rtm->rtm_index, sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index)); /* * rte flags may consist of RTF_HOST (duplicated in nhop rtflags) * and RTF_UP (if entry is linked, which is always true here). * Given that, use nhop rtflags & add RTF_UP. */ rtm->rtm_flags = rtflags | RTF_UP; if (rtm->rtm_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rtm->rtm_flags & ~RTF_GWFLAG_COMPAT); rt_getmetrics(rt, nh, &rtm->rtm_rmx); rtm->rtm_rmx.rmx_weight = weight; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifm32->_ifm_spare2 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifm->_ifm_spare2 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct if_data ifd; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; if_data_copy(ifp, &ifd); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &ifd, &info, w, len); else error = sysctl_iflist_ifm(ifp, &ifd, &info, w, len); if (error) goto done; } while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; int error, len; NET_EPOCH_ASSERT(); error = 0; bzero((caddr_t)&info, sizeof(info)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) break; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; ifmam->_ifmam_spare1 = 0; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error != 0) break; } } if (error != 0) break; } return (error); } static void rtable_sysctl_dump(uint32_t fibnum, int family, struct walkarg *w) { union sockaddr_union sa_dst, sa_mask; w->family = family; w->dst = (struct sockaddr *)&sa_dst; w->mask = (struct sockaddr *)&sa_mask; init_sockaddrs_family(family, w->dst, w->mask); rib_walk(fibnum, family, false, sysctl_dumpentry, w); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; if (namelen < 3) return (EINVAL); name++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); NET_EPOCH_ENTER(et); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { rtable_sysctl_dump(fib, i, &w); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_NHOP: case NET_RT_NHGRP: /* Allow dumping one specific af/fib at a time */ if (namelen < 4) { error = EINVAL; break; } fib = name[3]; if (fib < 0 || fib > rt_numfibs) { error = EINVAL; break; } rnh = rt_tables_get_rnh(fib, af); if (rnh == NULL) { error = EAFNOSUPPORT; break; } if (w.w_op == NET_RT_NHOP) error = nhops_dump_sysctl(rnh, w.w_req); else #ifdef ROUTE_MPATH error = nhgrp_dump_sysctl(rnh, w.w_req); #else error = ENOTSUP; #endif break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } NET_EPOCH_EXIT(et); free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_rtsock, "Return route tables and interface/address lists"); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[nitems(routesw)] }; -VNET_DOMAIN_SET(route); +DOMAIN_SET(route); diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket.c b/sys/netgraph/bluetooth/socket/ng_btsocket.c index 93866ce79d6c..416f07a670c6 100644 --- a/sys/netgraph/bluetooth/socket/ng_btsocket.c +++ b/sys/netgraph/bluetooth/socket/ng_btsocket.c @@ -1,290 +1,290 @@ /* * ng_btsocket.c */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001-2002 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: ng_btsocket.c,v 1.4 2003/09/14 23:29:06 max Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ng_btsocket_modevent (module_t, int, void *); static struct domain ng_btsocket_domain; /* * Bluetooth raw HCI sockets */ static struct pr_usrreqs ng_btsocket_hci_raw_usrreqs = { .pru_abort = ng_btsocket_hci_raw_abort, .pru_attach = ng_btsocket_hci_raw_attach, .pru_bind = ng_btsocket_hci_raw_bind, .pru_connect = ng_btsocket_hci_raw_connect, .pru_control = ng_btsocket_hci_raw_control, .pru_detach = ng_btsocket_hci_raw_detach, .pru_disconnect = ng_btsocket_hci_raw_disconnect, .pru_peeraddr = ng_btsocket_hci_raw_peeraddr, .pru_send = ng_btsocket_hci_raw_send, .pru_shutdown = NULL, .pru_sockaddr = ng_btsocket_hci_raw_sockaddr, .pru_close = ng_btsocket_hci_raw_close, }; /* * Bluetooth raw L2CAP sockets */ static struct pr_usrreqs ng_btsocket_l2cap_raw_usrreqs = { .pru_abort = ng_btsocket_l2cap_raw_abort, .pru_attach = ng_btsocket_l2cap_raw_attach, .pru_bind = ng_btsocket_l2cap_raw_bind, .pru_connect = ng_btsocket_l2cap_raw_connect, .pru_control = ng_btsocket_l2cap_raw_control, .pru_detach = ng_btsocket_l2cap_raw_detach, .pru_disconnect = ng_btsocket_l2cap_raw_disconnect, .pru_peeraddr = ng_btsocket_l2cap_raw_peeraddr, .pru_send = ng_btsocket_l2cap_raw_send, .pru_shutdown = NULL, .pru_sockaddr = ng_btsocket_l2cap_raw_sockaddr, .pru_close = ng_btsocket_l2cap_raw_close, }; /* * Bluetooth SEQPACKET L2CAP sockets */ static struct pr_usrreqs ng_btsocket_l2cap_usrreqs = { .pru_abort = ng_btsocket_l2cap_abort, .pru_accept = ng_btsocket_l2cap_accept, .pru_attach = ng_btsocket_l2cap_attach, .pru_bind = ng_btsocket_l2cap_bind, .pru_connect = ng_btsocket_l2cap_connect, .pru_control = ng_btsocket_l2cap_control, .pru_detach = ng_btsocket_l2cap_detach, .pru_disconnect = ng_btsocket_l2cap_disconnect, .pru_listen = ng_btsocket_l2cap_listen, .pru_peeraddr = ng_btsocket_l2cap_peeraddr, .pru_send = ng_btsocket_l2cap_send, .pru_shutdown = NULL, .pru_sockaddr = ng_btsocket_l2cap_sockaddr, .pru_close = ng_btsocket_l2cap_close, }; /* * Bluetooth STREAM RFCOMM sockets */ static struct pr_usrreqs ng_btsocket_rfcomm_usrreqs = { .pru_abort = ng_btsocket_rfcomm_abort, .pru_accept = ng_btsocket_rfcomm_accept, .pru_attach = ng_btsocket_rfcomm_attach, .pru_bind = ng_btsocket_rfcomm_bind, .pru_connect = ng_btsocket_rfcomm_connect, .pru_control = ng_btsocket_rfcomm_control, .pru_detach = ng_btsocket_rfcomm_detach, .pru_disconnect = ng_btsocket_rfcomm_disconnect, .pru_listen = ng_btsocket_rfcomm_listen, .pru_peeraddr = ng_btsocket_rfcomm_peeraddr, .pru_send = ng_btsocket_rfcomm_send, .pru_shutdown = NULL, .pru_sockaddr = ng_btsocket_rfcomm_sockaddr, .pru_close = ng_btsocket_rfcomm_close, }; /* * Bluetooth SEQPACKET SCO sockets */ static struct pr_usrreqs ng_btsocket_sco_usrreqs = { .pru_abort = ng_btsocket_sco_abort, .pru_accept = ng_btsocket_sco_accept, .pru_attach = ng_btsocket_sco_attach, .pru_bind = ng_btsocket_sco_bind, .pru_connect = ng_btsocket_sco_connect, .pru_control = ng_btsocket_sco_control, .pru_detach = ng_btsocket_sco_detach, .pru_disconnect = ng_btsocket_sco_disconnect, .pru_listen = ng_btsocket_sco_listen, .pru_peeraddr = ng_btsocket_sco_peeraddr, .pru_send = ng_btsocket_sco_send, .pru_shutdown = NULL, .pru_sockaddr = ng_btsocket_sco_sockaddr, .pru_close = ng_btsocket_sco_close, }; /* * Definitions of protocols supported in the BLUETOOTH domain */ static struct protosw ng_btsocket_protosw[] = { { .pr_type = SOCK_RAW, .pr_domain = &ng_btsocket_domain, .pr_protocol = BLUETOOTH_PROTO_HCI, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = ng_btsocket_hci_raw_ctloutput, .pr_usrreqs = &ng_btsocket_hci_raw_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &ng_btsocket_domain, .pr_protocol = BLUETOOTH_PROTO_L2CAP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_usrreqs = &ng_btsocket_l2cap_raw_usrreqs, }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &ng_btsocket_domain, .pr_protocol = BLUETOOTH_PROTO_L2CAP, .pr_flags = PR_ATOMIC|PR_CONNREQUIRED, .pr_ctloutput = ng_btsocket_l2cap_ctloutput, .pr_usrreqs = &ng_btsocket_l2cap_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &ng_btsocket_domain, .pr_protocol = BLUETOOTH_PROTO_RFCOMM, .pr_flags = PR_CONNREQUIRED, .pr_ctloutput = ng_btsocket_rfcomm_ctloutput, .pr_usrreqs = &ng_btsocket_rfcomm_usrreqs, }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &ng_btsocket_domain, .pr_protocol = BLUETOOTH_PROTO_SCO, .pr_flags = PR_ATOMIC|PR_CONNREQUIRED, .pr_ctloutput = ng_btsocket_sco_ctloutput, .pr_usrreqs = &ng_btsocket_sco_usrreqs, }, }; #define ng_btsocket_protosw_end \ &ng_btsocket_protosw[nitems(ng_btsocket_protosw)] /* * BLUETOOTH domain */ static struct domain ng_btsocket_domain = { .dom_family = AF_BLUETOOTH, .dom_name = "bluetooth", .dom_protosw = ng_btsocket_protosw, .dom_protoswNPROTOSW = ng_btsocket_protosw_end }; /* * Socket sysctl tree */ SYSCTL_NODE(_net_bluetooth_hci, OID_AUTO, sockets, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth HCI sockets family"); SYSCTL_NODE(_net_bluetooth_l2cap, OID_AUTO, sockets, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth L2CAP sockets family"); SYSCTL_NODE(_net_bluetooth_rfcomm, OID_AUTO, sockets, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth RFCOMM sockets family"); SYSCTL_NODE(_net_bluetooth_sco, OID_AUTO, sockets, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bluetooth SCO sockets family"); /* * Module */ static moduledata_t ng_btsocket_mod = { "ng_btsocket", ng_btsocket_modevent, NULL }; DECLARE_MODULE(ng_btsocket, ng_btsocket_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_VERSION(ng_btsocket, NG_BLUETOOTH_VERSION); MODULE_DEPEND(ng_btsocket, ng_bluetooth, NG_BLUETOOTH_VERSION, NG_BLUETOOTH_VERSION, NG_BLUETOOTH_VERSION); MODULE_DEPEND(ng_btsocket, netgraph, NG_ABI_VERSION, NG_ABI_VERSION, NG_ABI_VERSION); /* * Handle loading and unloading for this node type. * This is to handle auxiliary linkages (e.g protocol domain addition). */ static int ng_btsocket_modevent(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: break; case MOD_UNLOAD: /* XXX can't unload protocol domain yet */ error = EBUSY; break; default: error = EOPNOTSUPP; break; } return (error); } /* ng_btsocket_modevent */ -VNET_DOMAIN_SET(ng_btsocket_); +DOMAIN_SET(ng_btsocket_); diff --git a/sys/netgraph/ng_socket.c b/sys/netgraph/ng_socket.c index 05f37579aba1..d8a7d2f3701f 100644 --- a/sys/netgraph/ng_socket.c +++ b/sys/netgraph/ng_socket.c @@ -1,1234 +1,1234 @@ /* * ng_socket.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Julian Elischer * * $FreeBSD$ * $Whistle: ng_socket.c,v 1.28 1999/11/01 09:24:52 julian Exp $ */ /* * Netgraph socket nodes * * There are two types of netgraph sockets, control and data. * Control sockets have a netgraph node, but data sockets are * parasitic on control sockets, and have no node of their own. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_PATH, "netgraph_path", "netgraph path info"); static MALLOC_DEFINE(M_NETGRAPH_SOCK, "netgraph_sock", "netgraph socket info"); #else #define M_NETGRAPH_PATH M_NETGRAPH #define M_NETGRAPH_SOCK M_NETGRAPH #endif /* * It's Ascii-art time! * +-------------+ +-------------+ * |socket (ctl)| |socket (data)| * +-------------+ +-------------+ * ^ ^ * | | * v v * +-----------+ +-----------+ * |pcb (ctl)| |pcb (data)| * +-----------+ +-----------+ * ^ ^ * | | * v v * +--------------------------+ * | Socket type private | * | data | * +--------------------------+ * ^ * | * v * +----------------+ * | struct ng_node | * +----------------+ */ /* Netgraph node methods */ static ng_constructor_t ngs_constructor; static ng_rcvmsg_t ngs_rcvmsg; static ng_shutdown_t ngs_shutdown; static ng_newhook_t ngs_newhook; static ng_connect_t ngs_connect; static ng_findhook_t ngs_findhook; static ng_rcvdata_t ngs_rcvdata; static ng_disconnect_t ngs_disconnect; /* Internal methods */ static int ng_attach_data(struct socket *so); static int ng_attach_cntl(struct socket *so); static int ng_attach_common(struct socket *so, int type); static void ng_detach_common(struct ngpcb *pcbp, int type); static void ng_socket_free_priv(struct ngsock *priv); static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp); static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp); static int ngs_mod_event(module_t mod, int event, void *data); static void ng_socket_item_applied(void *context, int error); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_SOCKET_NODE_TYPE, .mod_event = ngs_mod_event, .constructor = ngs_constructor, .rcvmsg = ngs_rcvmsg, .shutdown = ngs_shutdown, .newhook = ngs_newhook, .connect = ngs_connect, .findhook = ngs_findhook, .rcvdata = ngs_rcvdata, .disconnect = ngs_disconnect, }; NETGRAPH_INIT_ORDERED(socket, &typestruct, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); /* Buffer space */ static u_long ngpdg_sendspace = 20 * 1024; /* really max datagram size */ SYSCTL_ULONG(_net_graph, OID_AUTO, maxdgram, CTLFLAG_RW, &ngpdg_sendspace , 0, "Maximum outgoing Netgraph datagram size"); static u_long ngpdg_recvspace = 20 * 1024; SYSCTL_ULONG(_net_graph, OID_AUTO, recvspace, CTLFLAG_RW, &ngpdg_recvspace , 0, "Maximum space for incoming Netgraph datagrams"); /* List of all sockets (for netstat -f netgraph) */ static LIST_HEAD(, ngpcb) ngsocklist; static struct mtx ngsocketlist_mtx; #define sotongpcb(so) ((struct ngpcb *)(so)->so_pcb) /* If getting unexplained errors returned, set this to "kdb_enter("X"); */ #ifndef TRAP_ERROR #define TRAP_ERROR #endif struct hookpriv { LIST_ENTRY(hookpriv) next; hook_p hook; }; LIST_HEAD(ngshash, hookpriv); /* Per-node private data */ struct ngsock { struct ng_node *node; /* the associated netgraph node */ struct ngpcb *datasock; /* optional data socket */ struct ngpcb *ctlsock; /* optional control socket */ struct ngshash *hash; /* hash for hook names */ u_long hmask; /* hash mask */ int flags; int refs; struct mtx mtx; /* mtx to wait on */ int error; /* place to store error */ }; #define NGS_FLAG_NOLINGER 1 /* close with last hook */ /*************************************************************** Control sockets ***************************************************************/ static int ngc_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); int error; error = priv_check(td, PRIV_NETGRAPH_CONTROL); if (error) return (error); if (pcbp != NULL) return (EISCONN); return (ng_attach_cntl(so)); } static void ngc_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); KASSERT(pcbp != NULL, ("ngc_detach: pcbp == NULL")); ng_detach_common(pcbp, NG_CONTROL); } static int ngc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct ngsock *const priv = NG_NODE_PRIVATE(pcbp->sockdata->node); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; struct ng_mesg *msg; struct mbuf *m0; item_p item; char *path = NULL; int len, error = 0; struct ng_apply_info apply; if (control) { error = EINVAL; goto release; } /* Require destination as there may be >= 1 hooks on this node. */ if (addr == NULL) { error = EDESTADDRREQ; goto release; } if (sap->sg_len > NG_NODESIZ + offsetof(struct sockaddr_ng, sg_data)) { error = EINVAL; goto release; } /* * Allocate an expendable buffer for the path, chop off * the sockaddr header, and make sure it's NUL terminated. */ len = sap->sg_len - offsetof(struct sockaddr_ng, sg_data); path = malloc(len + 1, M_NETGRAPH_PATH, M_WAITOK); bcopy(sap->sg_data, path, len); path[len] = '\0'; /* * Move the actual message out of mbufs into a linear buffer. * Start by adding up the size of the data. (could use mh_len?) */ for (len = 0, m0 = m; m0 != NULL; m0 = m0->m_next) len += m0->m_len; /* * Move the data into a linear buffer as well. * Messages are not delivered in mbufs. */ msg = malloc(len + 1, M_NETGRAPH_MSG, M_WAITOK); m_copydata(m, 0, len, (char *)msg); if (msg->header.version != NG_VERSION) { free(msg, M_NETGRAPH_MSG); error = EINVAL; goto release; } /* * Hack alert! * We look into the message and if it mkpeers a node of unknown type, we * try to load it. We need to do this now, in syscall thread, because if * message gets queued and applied later we will get panic. */ if (msg->header.typecookie == NGM_GENERIC_COOKIE && msg->header.cmd == NGM_MKPEER) { struct ngm_mkpeer *const mkp = (struct ngm_mkpeer *) msg->data; if (ng_findtype(mkp->type) == NULL) { char filename[NG_TYPESIZ + 3]; int fileid; /* Not found, try to load it as a loadable module. */ snprintf(filename, sizeof(filename), "ng_%s", mkp->type); error = kern_kldload(curthread, filename, &fileid); if (error != 0) { free(msg, M_NETGRAPH_MSG); goto release; } /* See if type has been loaded successfully. */ if (ng_findtype(mkp->type) == NULL) { free(msg, M_NETGRAPH_MSG); (void)kern_kldunload(curthread, fileid, LINKER_UNLOAD_NORMAL); error = ENXIO; goto release; } } } item = ng_package_msg(msg, NG_WAITOK); if ((error = ng_address_path((pcbp->sockdata->node), item, path, 0)) != 0) { #ifdef TRACE_MESSAGES printf("ng_address_path: errx=%d\n", error); #endif goto release; } #ifdef TRACE_MESSAGES printf("[%x]:<---------[socket]: c=<%d>cmd=%x(%s) f=%x #%d (%s)\n", item->el_dest->nd_ID, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token, item->el_dest->nd_type->name); #endif SAVE_LINE(item); /* * We do not want to return from syscall until the item * is processed by destination node. We register callback * on the item, which will update priv->error when item * was applied. * If ng_snd_item() has queued item, we sleep until * callback wakes us up. */ bzero(&apply, sizeof(apply)); apply.apply = ng_socket_item_applied; apply.context = priv; item->apply = &apply; priv->error = -1; error = ng_snd_item(item, 0); mtx_lock(&priv->mtx); if (priv->error == -1) msleep(priv, &priv->mtx, 0, "ngsock", 0); mtx_unlock(&priv->mtx); KASSERT(priv->error != -1, ("ng_socket: priv->error wasn't updated")); error = priv->error; release: if (path != NULL) free(path, M_NETGRAPH_PATH); if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); return (ng_bind(nam, pcbp)); } static int ngc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { /* * At this time refuse to do this.. it used to * do something but it was undocumented and not used. */ printf("program tried to connect control socket to remote node\n"); return (EINVAL); } /*************************************************************** Data sockets ***************************************************************/ static int ngd_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp != NULL) return (EISCONN); return (ng_attach_data(so)); } static void ngd_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); KASSERT(pcbp != NULL, ("ngd_detach: pcbp == NULL")); ng_detach_common(pcbp, NG_DATA); } static int ngd_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct epoch_tracker et; struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; int len, error; hook_p hook = NULL; item_p item; char hookname[NG_HOOKSIZ]; if ((pcbp == NULL) || (control != NULL)) { error = EINVAL; goto release; } if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } if (sap == NULL) { len = 0; /* Make compiler happy. */ } else { if (sap->sg_len > NG_NODESIZ + offsetof(struct sockaddr_ng, sg_data)) { error = EINVAL; goto release; } len = sap->sg_len - offsetof(struct sockaddr_ng, sg_data); } /* * If the user used any of these ways to not specify an address * then handle specially. */ if ((sap == NULL) || (len <= 0) || (*sap->sg_data == '\0')) { if (NG_NODE_NUMHOOKS(pcbp->sockdata->node) != 1) { error = EDESTADDRREQ; goto release; } /* * If exactly one hook exists, just use it. * Special case to allow write(2) to work on an ng_socket. */ hook = LIST_FIRST(&pcbp->sockdata->node->nd_hooks); } else { if (len >= NG_HOOKSIZ) { error = EINVAL; goto release; } /* * chop off the sockaddr header, and make sure it's NUL * terminated */ bcopy(sap->sg_data, hookname, len); hookname[len] = '\0'; /* Find the correct hook from 'hookname' */ hook = ng_findhook(pcbp->sockdata->node, hookname); if (hook == NULL) { error = EHOSTUNREACH; goto release; } } /* Send data. */ item = ng_package_data(m, NG_WAITOK); m = NULL; NET_EPOCH_ENTER(et); NG_FWD_ITEM_HOOK(error, item, hook); NET_EPOCH_EXIT(et); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngd_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); return (ng_connect_data(nam, pcbp)); } /* * Used for both data and control sockets */ static int ng_getsockaddr(struct socket *so, struct sockaddr **addr) { struct ngpcb *pcbp; struct sockaddr_ng *sg; int sg_len; int error = 0; pcbp = sotongpcb(so); if ((pcbp == NULL) || (pcbp->sockdata == NULL)) /* XXXGL: can this still happen? */ return (EINVAL); sg_len = sizeof(struct sockaddr_ng) + NG_NODESIZ - sizeof(sg->sg_data); sg = malloc(sg_len, M_SONAME, M_WAITOK | M_ZERO); mtx_lock(&pcbp->sockdata->mtx); if (pcbp->sockdata->node != NULL) { node_p node = pcbp->sockdata->node; if (NG_NODE_HAS_NAME(node)) bcopy(NG_NODE_NAME(node), sg->sg_data, strlen(NG_NODE_NAME(node))); mtx_unlock(&pcbp->sockdata->mtx); sg->sg_len = sg_len; sg->sg_family = AF_NETGRAPH; *addr = (struct sockaddr *)sg; } else { mtx_unlock(&pcbp->sockdata->mtx); free(sg, M_SONAME); error = EINVAL; } return (error); } /* * Attach a socket to it's protocol specific partner. * For a control socket, actually create a netgraph node and attach * to it as well. */ static int ng_attach_cntl(struct socket *so) { struct ngsock *priv; struct ngpcb *pcbp; node_p node; int error; /* Setup protocol control block */ if ((error = ng_attach_common(so, NG_CONTROL)) != 0) return (error); pcbp = sotongpcb(so); /* Make the generic node components */ if ((error = ng_make_node_common(&typestruct, &node)) != 0) { ng_detach_common(pcbp, NG_CONTROL); return (error); } /* * Allocate node private info and hash. We start * with 16 hash entries, however we may grow later * in ngs_newhook(). We can't predict how much hooks * does this node plan to have. */ priv = malloc(sizeof(*priv), M_NETGRAPH_SOCK, M_WAITOK | M_ZERO); priv->hash = hashinit(16, M_NETGRAPH_SOCK, &priv->hmask); /* Initialize mutex. */ mtx_init(&priv->mtx, "ng_socket", NULL, MTX_DEF); /* Link the pcb the private data. */ priv->ctlsock = pcbp; pcbp->sockdata = priv; priv->refs++; priv->node = node; pcbp->node_id = node->nd_ID; /* hint for netstat(1) */ /* Link the node and the private data. */ NG_NODE_SET_PRIVATE(priv->node, priv); NG_NODE_REF(priv->node); priv->refs++; return (0); } static int ng_attach_data(struct socket *so) { return (ng_attach_common(so, NG_DATA)); } /* * Set up a socket protocol control block. * This code is shared between control and data sockets. */ static int ng_attach_common(struct socket *so, int type) { struct ngpcb *pcbp; int error; /* Standard socket setup stuff. */ error = soreserve(so, ngpdg_sendspace, ngpdg_recvspace); if (error) return (error); /* Allocate the pcb. */ pcbp = malloc(sizeof(struct ngpcb), M_PCB, M_WAITOK | M_ZERO); pcbp->type = type; /* Link the pcb and the socket. */ so->so_pcb = (caddr_t)pcbp; pcbp->ng_socket = so; /* Add the socket to linked list */ mtx_lock(&ngsocketlist_mtx); LIST_INSERT_HEAD(&ngsocklist, pcbp, socks); mtx_unlock(&ngsocketlist_mtx); return (0); } /* * Disassociate the socket from it's protocol specific * partner. If it's attached to a node's private data structure, * then unlink from that too. If we were the last socket attached to it, * then shut down the entire node. Shared code for control and data sockets. */ static void ng_detach_common(struct ngpcb *pcbp, int which) { struct ngsock *priv = pcbp->sockdata; if (priv != NULL) { mtx_lock(&priv->mtx); switch (which) { case NG_CONTROL: priv->ctlsock = NULL; break; case NG_DATA: priv->datasock = NULL; break; default: panic("%s", __func__); } pcbp->sockdata = NULL; pcbp->node_id = 0; ng_socket_free_priv(priv); } pcbp->ng_socket->so_pcb = NULL; mtx_lock(&ngsocketlist_mtx); LIST_REMOVE(pcbp, socks); mtx_unlock(&ngsocketlist_mtx); free(pcbp, M_PCB); } /* * Remove a reference from node private data. */ static void ng_socket_free_priv(struct ngsock *priv) { mtx_assert(&priv->mtx, MA_OWNED); priv->refs--; if (priv->refs == 0) { mtx_destroy(&priv->mtx); hashdestroy(priv->hash, M_NETGRAPH_SOCK, priv->hmask); free(priv, M_NETGRAPH_SOCK); return; } if ((priv->refs == 1) && (priv->node != NULL)) { node_p node = priv->node; priv->node = NULL; mtx_unlock(&priv->mtx); NG_NODE_UNREF(node); ng_rmnode_self(node); } else mtx_unlock(&priv->mtx); } /* * Connect the data socket to a named control socket node. */ static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp) { struct sockaddr_ng *sap; node_p farnode; struct ngsock *priv; int error; item_p item; /* If we are already connected, don't do it again. */ if (pcbp->sockdata != NULL) return (EISCONN); /* * Find the target (victim) and check it doesn't already have * a data socket. Also check it is a 'socket' type node. * Use ng_package_data() and ng_address_path() to do this. */ sap = (struct sockaddr_ng *) nam; /* The item will hold the node reference. */ item = ng_package_data(NULL, NG_WAITOK); if ((error = ng_address_path(NULL, item, sap->sg_data, 0))) return (error); /* item is freed on failure */ /* * Extract node from item and free item. Remember we now have * a reference on the node. The item holds it for us. * when we free the item we release the reference. */ farnode = item->el_dest; /* shortcut */ if (strcmp(farnode->nd_type->name, NG_SOCKET_NODE_TYPE) != 0) { NG_FREE_ITEM(item); /* drop the reference to the node */ return (EINVAL); } priv = NG_NODE_PRIVATE(farnode); if (priv->datasock != NULL) { NG_FREE_ITEM(item); /* drop the reference to the node */ return (EADDRINUSE); } /* * Link the PCB and the private data struct. and note the extra * reference. Drop the extra reference on the node. */ mtx_lock(&priv->mtx); priv->datasock = pcbp; pcbp->sockdata = priv; pcbp->node_id = priv->node->nd_ID; /* hint for netstat(1) */ priv->refs++; mtx_unlock(&priv->mtx); NG_FREE_ITEM(item); /* drop the reference to the node */ return (0); } /* * Binding a socket means giving the corresponding node a name */ static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp) { struct ngsock *const priv = pcbp->sockdata; struct sockaddr_ng *const sap = (struct sockaddr_ng *) nam; if (priv == NULL) { TRAP_ERROR; return (EINVAL); } if ((sap->sg_len < 4) || (sap->sg_len > (NG_NODESIZ + 2)) || (sap->sg_data[0] == '\0') || (sap->sg_data[sap->sg_len - 3] != '\0')) { TRAP_ERROR; return (EINVAL); } return (ng_name_node(priv->node, sap->sg_data)); } /*************************************************************** Netgraph node ***************************************************************/ /* * You can only create new nodes from the socket end of things. */ static int ngs_constructor(node_p nodep) { return (EINVAL); } static void ngs_rehash(node_p node) { struct ngsock *priv = NG_NODE_PRIVATE(node); struct ngshash *new; struct hookpriv *hp; hook_p hook; uint32_t h; u_long hmask; new = hashinit_flags((priv->hmask + 1) * 2, M_NETGRAPH_SOCK, &hmask, HASH_NOWAIT); if (new == NULL) return; LIST_FOREACH(hook, &node->nd_hooks, hk_hooks) { hp = NG_HOOK_PRIVATE(hook); #ifdef INVARIANTS LIST_REMOVE(hp, next); #endif h = hash32_str(NG_HOOK_NAME(hook), HASHINIT) & hmask; LIST_INSERT_HEAD(&new[h], hp, next); } hashdestroy(priv->hash, M_NETGRAPH_SOCK, priv->hmask); priv->hash = new; priv->hmask = hmask; } /* * We allow any hook to be connected to the node. * There is no per-hook private information though. */ static int ngs_newhook(node_p node, hook_p hook, const char *name) { struct ngsock *const priv = NG_NODE_PRIVATE(node); struct hookpriv *hp; uint32_t h; hp = malloc(sizeof(*hp), M_NETGRAPH_SOCK, M_NOWAIT); if (hp == NULL) return (ENOMEM); if (node->nd_numhooks * 2 > priv->hmask) ngs_rehash(node); hp->hook = hook; h = hash32_str(name, HASHINIT) & priv->hmask; LIST_INSERT_HEAD(&priv->hash[h], hp, next); NG_HOOK_SET_PRIVATE(hook, hp); return (0); } /* * If only one hook, allow read(2) and write(2) to work. */ static int ngs_connect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); struct ngsock *priv = NG_NODE_PRIVATE(node); if ((priv->datasock) && (priv->datasock->ng_socket)) { if (NG_NODE_NUMHOOKS(node) == 1) priv->datasock->ng_socket->so_state |= SS_ISCONNECTED; else priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED; } return (0); } /* Look up hook by name */ static hook_p ngs_findhook(node_p node, const char *name) { struct ngsock *priv = NG_NODE_PRIVATE(node); struct hookpriv *hp; uint32_t h; /* * Microoptimisation for an ng_socket with * a single hook, which is a common case. */ if (node->nd_numhooks == 1) { hook_p hook; hook = LIST_FIRST(&node->nd_hooks); if (strcmp(NG_HOOK_NAME(hook), name) == 0) return (hook); else return (NULL); } h = hash32_str(name, HASHINIT) & priv->hmask; LIST_FOREACH(hp, &priv->hash[h], next) if (strcmp(NG_HOOK_NAME(hp->hook), name) == 0) return (hp->hook); return (NULL); } /* * Incoming messages get passed up to the control socket. * Unless they are for us specifically (socket_type) */ static int ngs_rcvmsg(node_p node, item_p item, hook_p lasthook) { struct ngsock *const priv = NG_NODE_PRIVATE(node); struct ngpcb *pcbp; struct socket *so; struct sockaddr_ng addr; struct ng_mesg *msg; struct mbuf *m; ng_ID_t retaddr = NGI_RETADDR(item); int addrlen; int error = 0; NGI_GET_MSG(item, msg); NG_FREE_ITEM(item); /* * Grab priv->mtx here to prevent destroying of control socket * after checking that priv->ctlsock is not NULL. */ mtx_lock(&priv->mtx); pcbp = priv->ctlsock; /* * Only allow mesgs to be passed if we have the control socket. * Data sockets can only support the generic messages. */ if (pcbp == NULL) { mtx_unlock(&priv->mtx); TRAP_ERROR; NG_FREE_MSG(msg); return (EINVAL); } so = pcbp->ng_socket; SOCKBUF_LOCK(&so->so_rcv); /* As long as the race is handled, priv->mtx may be unlocked now. */ mtx_unlock(&priv->mtx); #ifdef TRACE_MESSAGES printf("[%x]:---------->[socket]: c=<%d>cmd=%x(%s) f=%x #%d\n", retaddr, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token); #endif if (msg->header.typecookie == NGM_SOCKET_COOKIE) { switch (msg->header.cmd) { case NGM_SOCK_CMD_NOLINGER: priv->flags |= NGS_FLAG_NOLINGER; break; case NGM_SOCK_CMD_LINGER: priv->flags &= ~NGS_FLAG_NOLINGER; break; default: error = EINVAL; /* unknown command */ } SOCKBUF_UNLOCK(&so->so_rcv); /* Free the message and return. */ NG_FREE_MSG(msg); return (error); } /* Get the return address into a sockaddr. */ bzero(&addr, sizeof(addr)); addr.sg_len = sizeof(addr); addr.sg_family = AF_NETGRAPH; addrlen = snprintf((char *)&addr.sg_data, sizeof(addr.sg_data), "[%x]:", retaddr); if (addrlen < 0 || addrlen > sizeof(addr.sg_data)) { SOCKBUF_UNLOCK(&so->so_rcv); printf("%s: snprintf([%x]) failed - %d\n", __func__, retaddr, addrlen); NG_FREE_MSG(msg); return (EINVAL); } /* Copy the message itself into an mbuf chain. */ m = m_devget((caddr_t)msg, sizeof(struct ng_mesg) + msg->header.arglen, 0, NULL, NULL); /* * Here we free the message. We need to do that * regardless of whether we got mbufs. */ NG_FREE_MSG(msg); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_rcv); TRAP_ERROR; return (ENOBUFS); } /* Send it up to the socket. */ if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&addr, m, NULL) == 0) { soroverflow_locked(so); TRAP_ERROR; m_freem(m); return (ENOBUFS); } /* sorwakeup_locked () releases the lock internally. */ sorwakeup_locked(so); return (error); } /* * Receive data on a hook */ static int ngs_rcvdata(hook_p hook, item_p item) { struct ngsock *const priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); struct ngpcb *const pcbp = priv->datasock; struct socket *so; struct sockaddr_ng *addr; char *addrbuf[NG_HOOKSIZ + 4]; int addrlen; struct mbuf *m; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* If there is no data socket, black-hole it. */ if (pcbp == NULL) { NG_FREE_M(m); return (0); } so = pcbp->ng_socket; /* Get the return address into a sockaddr. */ addrlen = strlen(NG_HOOK_NAME(hook)); /* <= NG_HOOKSIZ - 1 */ addr = (struct sockaddr_ng *) addrbuf; addr->sg_len = addrlen + 3; addr->sg_family = AF_NETGRAPH; bcopy(NG_HOOK_NAME(hook), addr->sg_data, addrlen); addr->sg_data[addrlen] = '\0'; /* Try to tell the socket which hook it came in on. */ SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)addr, m, NULL) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(m); TRAP_ERROR; return (ENOBUFS); } /* sorwakeup_locked () releases the lock internally. */ sorwakeup_locked(so); return (0); } /* * Hook disconnection * * For this type, removal of the last link destroys the node * if the NOLINGER flag is set. */ static int ngs_disconnect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); struct ngsock *const priv = NG_NODE_PRIVATE(node); struct hookpriv *hp = NG_HOOK_PRIVATE(hook); LIST_REMOVE(hp, next); free(hp, M_NETGRAPH_SOCK); if ((priv->datasock) && (priv->datasock->ng_socket)) { if (NG_NODE_NUMHOOKS(node) == 1) priv->datasock->ng_socket->so_state |= SS_ISCONNECTED; else priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED; } if ((priv->flags & NGS_FLAG_NOLINGER) && (NG_NODE_NUMHOOKS(node) == 0) && (NG_NODE_IS_VALID(node))) ng_rmnode_self(node); return (0); } /* * Do local shutdown processing. * In this case, that involves making sure the socket * knows we should be shutting down. */ static int ngs_shutdown(node_p node) { struct ngsock *const priv = NG_NODE_PRIVATE(node); struct ngpcb *dpcbp, *pcbp; mtx_lock(&priv->mtx); dpcbp = priv->datasock; pcbp = priv->ctlsock; if (dpcbp != NULL) soisdisconnected(dpcbp->ng_socket); if (pcbp != NULL) soisdisconnected(pcbp->ng_socket); priv->node = NULL; NG_NODE_SET_PRIVATE(node, NULL); ng_socket_free_priv(priv); NG_NODE_UNREF(node); return (0); } static void ng_socket_item_applied(void *context, int error) { struct ngsock *const priv = (struct ngsock *)context; mtx_lock(&priv->mtx); priv->error = error; wakeup(priv); mtx_unlock(&priv->mtx); } static int dummy_disconnect(struct socket *so) { return (0); } /* * Control and data socket type descriptors * * XXXRW: Perhaps _close should do something? */ static struct pr_usrreqs ngc_usrreqs = { .pru_abort = NULL, .pru_attach = ngc_attach, .pru_bind = ngc_bind, .pru_connect = ngc_connect, .pru_detach = ngc_detach, .pru_disconnect = dummy_disconnect, .pru_peeraddr = NULL, .pru_send = ngc_send, .pru_shutdown = NULL, .pru_sockaddr = ng_getsockaddr, .pru_close = NULL, }; static struct pr_usrreqs ngd_usrreqs = { .pru_abort = NULL, .pru_attach = ngd_attach, .pru_bind = NULL, .pru_connect = ngd_connect, .pru_detach = ngd_detach, .pru_disconnect = dummy_disconnect, .pru_peeraddr = NULL, .pru_send = ngd_send, .pru_shutdown = NULL, .pru_sockaddr = ng_getsockaddr, .pru_close = NULL, }; /* * Definitions of protocols supported in the NETGRAPH domain. */ extern struct domain ngdomain; /* stop compiler warnings */ static struct protosw ngsw[] = { { .pr_type = SOCK_DGRAM, .pr_domain = &ngdomain, .pr_protocol = NG_CONTROL, .pr_flags = PR_ATOMIC | PR_ADDR /* | PR_RIGHTS */, .pr_usrreqs = &ngc_usrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &ngdomain, .pr_protocol = NG_DATA, .pr_flags = PR_ATOMIC | PR_ADDR, .pr_usrreqs = &ngd_usrreqs } }; struct domain ngdomain = { .dom_family = AF_NETGRAPH, .dom_name = "netgraph", .dom_protosw = ngsw, .dom_protoswNPROTOSW = &ngsw[nitems(ngsw)] }; /* * Handle loading and unloading for this node type. * This is to handle auxiliary linkages (e.g protocol domain addition). */ static int ngs_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: mtx_init(&ngsocketlist_mtx, "ng_socketlist", NULL, MTX_DEF); break; case MOD_UNLOAD: /* Ensure there are no open netgraph sockets. */ if (!LIST_EMPTY(&ngsocklist)) { error = EBUSY; break; } #ifdef NOTYET /* Unregister protocol domain XXX can't do this yet.. */ #endif error = EBUSY; break; default: error = EOPNOTSUPP; break; } return (error); } -VNET_DOMAIN_SET(ng); +DOMAIN_SET(ng); SYSCTL_INT(_net_graph, OID_AUTO, family, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, AF_NETGRAPH, ""); static SYSCTL_NODE(_net_graph, OID_AUTO, data, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DATA"); SYSCTL_INT(_net_graph_data, OID_AUTO, proto, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_DATA, ""); static SYSCTL_NODE(_net_graph, OID_AUTO, control, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "CONTROL"); SYSCTL_INT(_net_graph_control, OID_AUTO, proto, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_CONTROL, ""); diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 035051c6f5da..b9f506518cce 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -1,341 +1,341 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mrouting.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include /* * While this file provides the domain and protocol switch tables for IPv4, it * also provides the sysctl node declarations for net.inet.* often shared with * IPv6 for common features or by upper layer protocols. In case of no IPv4 * support compile out everything but these sysctl nodes. */ #ifdef INET #include #include #include #include #endif /* INET */ #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #include #include #include #include #include #include #include #include #include #include /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. */ static struct pr_usrreqs nousrreqs; #ifdef SCTP #include #include #include #include #endif FEATURE(inet, "Internet Protocol version 4"); extern struct domain inetdomain; /* Spacer for loadable protocols. */ #define IPPROTOSPACER \ { \ .pr_domain = &inetdomain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inetsw[] = { { .pr_type = 0, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, .pr_flags = PR_CAPATTACH, .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drain, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| PR_CAPATTACH, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp_input, .pr_ctlinput = udplite_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RSVP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rsvp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_MOBILE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, /* Spacer n-times for loadable protocols. */ IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, }; struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[nitems(inetsw)], .dom_rtattach = in_inithead, #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; -VNET_DOMAIN_SET(inet); +DOMAIN_SET(inet); #endif /* INET */ SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Internet Family"); SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP"); SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ICMP"); SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP"); #if defined(SCTP) || defined(SCTP_SUPPORT) SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SCTP"); #endif SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IGMP"); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* XXX no protocol # to use, pick something "reserved" */ SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPSEC"); SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "AH"); SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ESP"); SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "RAW"); SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Accept filters"); diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 44eb0fa4ee62..6ccd90ac6fdd 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -1,607 +1,607 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_CAPATTACH, .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| PR_LISTEN|PR_CAPATTACH, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization, timeout, and drain routines twice */ .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, #endif .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, #ifndef INET /* Do not call initialization and drain routines twice. */ .pr_drain = sctp_drain, #endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp6_input, .pr_ctlinput = udplite6_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_fasttimo = icmp6_fasttimo, .pr_slowtimo = icmp6_slowtimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *)&inet6sw[nitems(inet6sw)], .dom_rtattach = in6_inithead, #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; -VNET_DOMAIN_SET(inet6); +DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP6"); #if defined(SCTP) || defined(SCTP_SUPPORT) SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SCTP6"); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_temp_preferred_lifetime; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); if (val < V_ip6_desync_factor + V_ip6_temp_regen_advance) return (EINVAL); V_ip6_temp_preferred_lifetime = val; return (0); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_temp_valid_lifetime; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); if (val < V_ip6_temp_preferred_lifetime) return (EINVAL); V_ip6_temp_valid_lifetime = val; return (0); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, "Enable forwarding of IPv6 packets between interfaces"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, "Send ICMPv6 redirects for unforwardable IPv6 packets"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0, "Default hop limit to use for outgoing IPv6 packets"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 RA messages"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set clear the R flag in ICMPv6 NA messages when accepting RA " "on the interface"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding is enabled"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "Frequency in seconds at which to log IPv6 forwarding errors"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, "Default maximum number of IPv6 extension headers permitted on " "incoming IPv6 packets, 0 for no artificial limit"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, "Number of ICMPv6 NS messages sent during duplicate address detection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, "Provide an IPv6 flowlabel in outbound packets"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, "Default hop limit for IPv6 multicast packets originating from this " "node"); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, "KAME version string"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, "Allow the use of addresses whose preferred lifetimes have expired"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, "Create RFC3041 temporary addresses for autoconfigured addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_temppltime, "I", "Maximum preferred lifetime for temporary addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_tempvltime, "I", "Maximum valid lifetime for temporary addresses"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, "Restrict AF_INET6 sockets to IPv6 addresses only"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6 " "link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, struct rip6stat, rip6stat, "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, "Prefer RFC3041 temporary addresses in source address selection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0, "Use the default scope zone when none is specified"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "Enable path MTU discovery for multicast packets"); #ifdef IPSTEALTH SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, "Forward IPv6 packets without decrementing their TTL"); #endif /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "Accept ICMPv6 redirect messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "Delay in seconds before expiring redirect route"); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, "Frequency in seconds of checks for expired prefixes and routers"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, "Delay in seconds before probing for reachability"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, "Number of ICMPv6 NS messages sent during reachability detection"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, "Number of ICMPv6 NS messages sent during address resolution"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, "Create a loopback route when configuring an IPv6 address"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "Mask of enabled RFC4620 node information query types"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup " "for compatibility with KAME implementation"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, "Maximum number of ICMPv6 error messages per second"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, "Log NDP debug messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861"); #ifdef EXPERIMENTAL SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_ignore_ipv6_only_ra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_ignore_ipv6_only_ra), 0, "Ignore the 'IPv6-Only flag' in RA messages in compliance with " "draft-ietf-6man-ipv6only-flag"); #endif diff --git a/sys/sys/domain.h b/sys/sys/domain.h index c9defe8e4341..409372e91915 100644 --- a/sys/sys/domain.h +++ b/sys/sys/domain.h @@ -1,105 +1,92 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)domain.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_DOMAIN_H_ #define _SYS_DOMAIN_H_ /* * Structure per communications domain. */ /* * Forward structure declarations for function prototypes [sic]. */ struct mbuf; struct ifnet; struct socket; struct rib_head; struct domain { int dom_family; /* AF_xxx */ char *dom_name; int dom_flags; int (*dom_probe)(void); /* check for support (optional) */ int (*dom_externalize) /* externalize access rights */ (struct mbuf *, struct mbuf **, int); void (*dom_dispose) /* dispose of internalized rights */ (struct socket *); struct protosw *dom_protosw, *dom_protoswNPROTOSW; struct domain *dom_next; struct rib_head *(*dom_rtattach) /* initialize routing table */ (uint32_t); void (*dom_rtdetach) /* clean up routing table */ (struct rib_head *); void *(*dom_ifattach)(struct ifnet *); void (*dom_ifdetach)(struct ifnet *, void *); int (*dom_ifmtu)(struct ifnet *); /* af-dependent data on ifnet */ }; /* dom_flags */ #define DOMF_SUPPORTED 0x0001 /* System supports this domain. */ #define DOMF_INITED 0x0002 /* Initialized in the default vnet. */ #ifdef _KERNEL extern int domain_init_status; extern struct domain *domains; void domain_add(void *); void domain_init(void *); #ifdef VIMAGE void vnet_domain_init(void *); void vnet_domain_uninit(void *); #endif #define DOMAIN_SET(name) \ SYSINIT(domain_add_ ## name, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_FIRST, domain_add, & name ## domain); \ SYSINIT(domain_init_ ## name, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_SECOND, domain_init, & name ## domain); -#ifdef VIMAGE -#define VNET_DOMAIN_SET(name) \ - SYSINIT(domain_add_ ## name, SI_SUB_PROTO_DOMAIN, \ - SI_ORDER_FIRST, domain_add, & name ## domain); \ - VNET_SYSINIT(vnet_domain_init_ ## name, SI_SUB_PROTO_DOMAIN, \ - SI_ORDER_SECOND, vnet_domain_init, & name ## domain); \ - VNET_SYSUNINIT(vnet_domain_uninit_ ## name, \ - SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, vnet_domain_uninit, \ - & name ## domain) -#else /* !VIMAGE */ -#define VNET_DOMAIN_SET(name) DOMAIN_SET(name) -#endif /* VIMAGE */ - #endif /* _KERNEL */ #endif /* !_SYS_DOMAIN_H_ */