Index: projects/nfs-over-tls/sys/kern/uipc_socket.c =================================================================== --- projects/nfs-over-tls/sys/kern/uipc_socket.c (revision 361061) +++ projects/nfs-over-tls/sys/kern/uipc_socket.c (revision 361062) @@ -1,4387 +1,4412 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets up socket layer state for a socket, called by * socreate(), sonewconn() and sopeeloff(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called by * sofree() and by the error paths of socreate(), sonewconn() and * sopeeloff(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). 
This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_locked(void *); static void so_rdknl_assert_unlocked(void *); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_locked(void *); static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, 
socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. 
*/ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. 
*/ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. 
*/ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. 
*/ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_locked, so_rdknl_assert_unlocked); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. 
* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 *
 * Returns NULL when the listen queue is over its limit, when soalloc()
 * or soreserve() fails, or when the protocol's pru_attach() fails.
 * The listen socket lock is taken and released internally; it is not
 * held on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct sbuf descrsb;
	struct socket *so;
	int len, overcount;
	u_int qlen;
	const char localprefix[] = "local:";
	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
	char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
	char addrbuf[INET_ADDRSTRLEN];
#endif
	bool dolog, over;

	SOLISTEN_LOCK(head);
	/* The completed queue may run up to 1.5 times the configured limit. */
	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		head->sol_overcount++;
		/* Rate-limit the log message to one per overinterval. */
		dolog = !!ratecheck(&head->sol_lastover, &overinterval);

		/*
		 * If we're going to log, copy the overflow count and queue
		 * length from the listen socket before dropping the lock.
		 * Also, reset the overflow count.
		 */
		if (dolog) {
			overcount = head->sol_overcount;
			head->sol_overcount = 0;
			qlen = head->sol_qlen;
		}
		SOLISTEN_UNLOCK(head);

		if (dolog) {
			/*
			 * Try to print something descriptive about the
			 * socket for the error message.
			 */
			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
			    SBUF_FIXEDLEN);
			switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
			case AF_INET:
#endif
#ifdef INET6
			case AF_INET6:
				if (head->so_proto->pr_domain->dom_family ==
				    AF_INET6 ||
				    (sotoinpcb(head)->inp_inc.inc_flags &
				    INC_ISIPV6)) {
					ip6_sprintf(addrbuf,
					    &sotoinpcb(head)->inp_inc.inc6_laddr);
					sbuf_printf(&descrsb, "[%s]", addrbuf);
				} else
#endif
				{
#ifdef INET
					inet_ntoa_r(
					    sotoinpcb(head)->inp_inc.inc_laddr,
					    addrbuf);
					sbuf_cat(&descrsb, addrbuf);
#endif
				}
				sbuf_printf(&descrsb, ":%hu (proto %u)",
				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
				    head->so_proto->pr_protocol);
				break;
#endif /* INET || INET6 */
			case AF_UNIX:
				sbuf_cat(&descrsb, localprefix);
				/* Path length is sun_len minus the fixed header. */
				if (sotounpcb(head)->unp_addr != NULL)
					len =
					    sotounpcb(head)->unp_addr->sun_len -
					    offsetof(struct sockaddr_un,
					    sun_path);
				else
					len = 0;
				if (len > 0)
					sbuf_bcat(&descrsb,
					    sotounpcb(head)->unp_addr->sun_path,
					    len);
				else
					sbuf_cat(&descrsb, "(unknown)");
				break;
			}

			/*
			 * If we can't print something more specific, at least
			 * print the domain name.
			 */
			if (sbuf_finish(&descrsb) != 0 ||
			    sbuf_len(&descrsb) <= 0) {
				sbuf_clear(&descrsb);
				sbuf_cat(&descrsb,
				    head->so_proto->pr_domain->dom_name ?:
				    "unknown");
				sbuf_finish(&descrsb);
			}
			KASSERT(sbuf_len(&descrsb) > 0,
			    ("%s: sbuf creation failed", __func__));
			/*
			 * NOTE(review): qlen is declared u_int but is
			 * formatted with %i below.
			 */
			log(LOG_DEBUG,
			    "%s: pcb %p (%s): Listen queue overflow: "
			    "%i already in queue awaiting acceptance "
			    "(%d occurrences)\n",
			    __func__, head->so_pcb, sbuf_data(&descrsb),
			    qlen, overcount);
			sbuf_delete(&descrsb);

			/* NOTE(review): overcount is not read after this store. */
			overcount = 0;
		}

		return (NULL);
	}
	SOLISTEN_UNLOCK(head);
	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
	    __func__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	/* Inherit type, linger, state, FIB, protocol and credential from head. */
	so->so_listen = head;
	so->so_type = head->so_type;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	/* Apply the buffer settings saved on the listen socket. */
	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
	so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;

	SOLISTEN_LOCK(head);
	/* With an accept filter installed the socket starts on the incomplete queue. */
	if (head->sol_accept_filter != NULL)
		connstatus = 0;
	so->so_state |= connstatus;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	soref(head); /* A socket on (in)complete queue refs head. */
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
		so->so_qstate = SQ_COMP;
		head->sol_qlen++;
		solisten_wakeup(head);	/* unlocks */
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->sol_incqlen > head->sol_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->sol_incomp);
			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
			head->sol_incqlen--;
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			sorele(head);	/* does SOLISTEN_UNLOCK, head stays */
			soabort(sp);
			SOLISTEN_LOCK(head);
		}
		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
		so->so_qstate = SQ_INCOMP;
		head->sol_incqlen++;
		SOLISTEN_UNLOCK(head);
	}
	return (so);
}

#ifdef SCTP
/*
 * Socket part of sctp_peeloff().  Detach a new socket from an
 * association.  The new socket is returned with a reference.
*/
struct socket *
sopeeloff(struct socket *head)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	/* Copy settings from head; only SS_NBIO is carried over from its state. */
	so->so_type = head->so_type;
	so->so_options = head->so_options;
	so->so_linger = head->so_linger;
	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;

	/* Hand the caller its reference. */
	soref(so);

	return (so);
}
#endif	/* SCTP */

/*
 * Bind a name to the socket by calling the protocol's pru_bind() with the
 * socket's vnet set as curvnet.  Returns the protocol's error code.
 */
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * As sobind(), but calls pru_bindat() and passes the descriptor fd through
 * to the protocol.
 */
int
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Call backs are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * Socket-layer check made before a socket is put into listening state.
 * Must be called with the socket lock held.  Returns EINVAL if the socket
 * is connected, connecting or disconnecting, and 0 otherwise.
 */
int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

/*
 * Put the socket into listening state (if it is not already listening) and
 * set its queue limit.  Must be called with the socket lock held.  A
 * backlog that is negative or larger than somaxconn is replaced by
 * somaxconn.
 */
void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);

	/* Already listening: only the queue limit is updated. */
	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 */
	/* Save the buffer settings before the socket buffers are destroyed. */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);

#ifdef INVARIANTS
	/* Zero everything from so_rcv to the end of the structure. */
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	/* Stash the saved values in the listen-socket (sol_*) fields. */
	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->sol_qlimit = backlog;
}

/*
 * Wakeup listeners/subsystems once we have a complete connection.
 * Enters with lock, returns unlocked.
 */
void
solisten_wakeup(struct socket *sol)
{

	/* A registered upcall replaces the select/kqueue notification. */
	if (sol->sol_upcall != NULL)
		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
	else {
		selwakeuppri(&sol->so_rdsel, PSOCK);
		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
	}
	SOLISTEN_UNLOCK(sol);
	/* Wake one thread sleeping on the completed queue. */
	wakeup_one(&sol->sol_comp);
	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
		pgsigio(&sol->so_sigio, SIGIO, 0);
}

/*
 * Return single connection off a listening socket queue.  Main consumer of
 * the function is kern_accept4().  Some modules, that do their own accept
 * management also use the function.
 *
 * Listening socket must be locked on entry and is returned unlocked on
 * return.
* The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *conn;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	/*
	 * A blocking listener sleeps until a completed connection shows up
	 * or an error is posted on the listen socket.
	 */
	for (;;) {
		if ((head->so_state & SS_NBIO) != 0)
			break;
		if (!TAILQ_EMPTY(&head->sol_comp))
			break;
		if (head->so_error != 0)
			break;
		error = msleep(&head->sol_comp, &head->so_lock,
		    PSOCK | PCATCH, "accept", 0);
		if (error != 0)
			goto fail;
	}

	/* A pending socket error is consumed and reported first. */
	if (head->so_error != 0) {
		error = head->so_error;
		head->so_error = 0;
		goto fail;
	}
	/* Non-blocking listener with nothing to hand out. */
	if ((head->so_state & SS_NBIO) != 0 && TAILQ_EMPTY(&head->sol_comp)) {
		error = EWOULDBLOCK;
		goto fail;
	}

	/* Detach the first completed connection from the listen socket. */
	conn = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(conn);
	KASSERT(conn->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, conn));
	soref(conn);
	head->sol_qlen--;
	conn->so_qstate = SQ_NONE;
	conn->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, conn, so_list);

	/* SS_NBIO is either inherited from head or taken from SOCK_NONBLOCK. */
	if ((flags & ACCEPT4_INHERIT) != 0)
		conn->so_state |= head->so_state & SS_NBIO;
	else if ((flags & SOCK_NONBLOCK) != 0)
		conn->so_state |= SS_NBIO;
	SOCK_UNLOCK(conn);
	sorele(head);

	*ret = conn;
	return (0);

fail:
	SOLISTEN_UNLOCK(head);
	return (error);
}

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present.  If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
*/
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);

	/* Something still holds the socket: just drop the lock and return. */
	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we don't have SS_NOFDREF neither SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so.  To preserve so and sol, we reference both and then
		 * relock.
		 * After relock the socket may not move to so_comp since it
		 * doesn't have PCB already, but it may be removed from
		 * so_incomp. If that happens, we share responsibility on
		 * freeing the socket, but soclose() has already removed
		 * it from queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		/* Re-check the queue state now that both locks are held. */
		if (so->so_qstate == SQ_INCOMP) {
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
			refcount_release(&sol->so_count);
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		sorele(sol);
		/* Only the temporary reference taken above should remain. */
		KASSERT(so->so_count == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 *
	 * NOTE(review): in the code as written, pru_detach() is called
	 * above, before sbdestroy()/knlist_destroy() below; the ordering
	 * described in the previous paragraph does not match -- confirm
	 * which is intended.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	bool listening;
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		/* Start a disconnect unless one is already in progress. */
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				/* ENOTCONN is not reported to the caller. */
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		if (so->so_options & SO_LINGER) {
			/* Do not wait when disconnecting on a SS_NBIO socket. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep on so_timeo until SS_ISCONNECTED clears or
			 * tsleep() returns nonzero; that value becomes the
			 * function's return value.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);

	SOCK_LOCK(so);
	if ((listening = (so->so_options & SO_ACCEPTCONN))) {
		struct socket *sp;

		/*
		 * Move both sol_incomp and sol_comp onto the local list
		 * lqueue and detach every queued socket from this listener,
		 * dropping one reference on the listener per queued socket.
		 */
		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/* Guaranteed not to be the last. */
			refcount_release(&so->so_count);
		}
	}
	/* Mark the file descriptor reference gone, then drop our reference. */
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	if (listening) {
		struct socket *sp, *tsp;

		/*
		 * Abort each socket taken off the listen queues whose
		 * reference count is zero; the others are only unlocked.
		 */
		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
			SOCK_LOCK(sp);
			if (sp->so_count == 0) {
				SOCK_UNLOCK(sp);
				soabort(sp);
			} else
				/* sp is now in sofree() */
				SOCK_UNLOCK(sp);
		}
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
* * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. 
*/ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. 
*/ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. 
*/ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. 
*/ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & 
SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_NOMAP | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? 
M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { /* * Note that error is intentionally * ignored. * * Like sendfile(), we rely on the * completion routine (pru_ready()) * to free the mbufs in the event that * pru_send() encountered an error and * did not append them to the sockbuf. 
*/ soref(so); ktls_enqueue(top, so, tls_enq_cnt); } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { m_freem(top); m_freem(control); error = ENOTCONN; } CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. 
*/ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. 
*/ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0 || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL && (m->m_flags & M_NOTAVAIL) == 0) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m == NULL && so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else if (m != NULL && (m->m_flags & M_NOTAVAIL) == 0) goto dontblock; } for (; m != NULL && (m->m_flags & M_NOTAVAIL) == 0; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if 
((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;
		struct cmsghdr *cmsg;
		struct tls_get_record tgr;

		/*
		 * For MSG_TLSAPPDATA, check for a non-application data
		 * record.  If found, then return ENXIO without removing
		 * it from the receive queue.  This allows a subsequent
		 * call without MSG_TLSAPPDATA to receive it.
		 * Note that, for TLS, there should only be a single
		 * control mbuf with the TLS_GET_RECORD message in it.
		 *
		 * The mbuf length is checked first so that neither the
		 * cmsghdr fields nor the tls_get_record payload are read
		 * beyond the end of a short control mbuf.
		 */
		if ((flags & MSG_TLSAPPDATA) != 0 &&
		    m->m_len >= CMSG_LEN(sizeof(tgr))) {
			cmsg = mtod(m, struct cmsghdr *);
			if (cmsg->cmsg_type == TLS_GET_RECORD &&
			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
				/* This will need to change for TLS 1.3. */
				if (tgr.tls_type != TLS_RLTYPE_APP) {
					/*
					 * Leave the record queued; only the
					 * sockbuf mutex is dropped here, the
					 * 'release' path undoes sblock().
					 */
					SOCKBUF_UNLOCK(&so->so_rcv);
					error = ENXIO;
					goto release;
				}
			}
		}

		/*
		 * Walk the leading run of MT_CONTROL mbufs.  When peeking,
		 * copy each one out through controlp (if supplied) and leave
		 * the socket buffer untouched.  Otherwise unlink each mbuf
		 * from the socket buffer and append it to the private chain
		 * 'cm' for processing below.
		 */
		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copym(m, 0, m->m_len,
					    M_NOWAIT);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		/* The head of the record changed; republish nextrecord. */
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		/*
		 * Dispose of each detached control mbuf: hand it to the
		 * domain's externalize hook (called with the sockbuf mutex
		 * dropped), or, when there is no hook, pass it to the caller
		 * via controlp or free it.
		 */
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				/* Advance controlp to the end of the chain. */
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		/*
		 * Re-read nextrecord from the socket buffer, which may have
		 * changed while the mutex was dropped above.
		 */
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m !=
NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. 
Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_NOMAP) != 0) error = m_unmappedtouio(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). 
*/ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. 
*/
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then don't need to generate ACK to peer to update window,
		 * since ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			VNET_SO_ASSERT(so);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		goto restart;
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 *
 * psa and controlp are only cleared (when non-NULL); this path never fills
 * them in.  MSG_OOB requests are handed to soreceive_rcvoob() and, with
 * KERN_TLS, sockets whose receive buffer has sb_tls_info set are handed to
 * soreceive_generic().  When mp0 is non-NULL the data is returned as an mbuf
 * chain in *mp0, otherwise it is copied into uio.
 *
 * Returns 0 on success, EINVAL for a non-SOCK_STREAM socket or an empty uio,
 * ENOTCONN if the socket is neither connected nor disconnected, EAGAIN if
 * no data is available and the request must not block, ENOBUFS if no mbuf
 * chain could be produced for *mp0, or the error reported by sblock(),
 * sbwait(), m_mbuftouio() or a pending so_error.
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

#ifdef KERN_TLS
	/*
	 * KTLS store TLS records as records with a control message to
	 * describe the framing.
	 *
	 * We check once here before acquiring locks to optimize the
	 * common case.
	 */
	if (sb->sb_tls_info != NULL)
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
#endif

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

#ifdef KERN_TLS
	/*
	 * Re-check now that both locks are held; both must be dropped again
	 * before handing the request to soreceive_generic().
	 */
	if (sb->sb_tls_info != NULL) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
	}
#endif

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/* Remembered so that a later so_error can tell if data was moved. */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		/* Buffered data is delivered before the error is reported. */
		if (sbavail(sb) > 0)
			goto deliver;
		/* Something was already moved: return success this time. */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		/* A peek leaves the pending error in place. */
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			/*
			 * Account for every whole mbuf that still fits in
			 * 'len'; 'n' trails as the last mbuf handed over and
			 * 'm' ends up as the first one left in the buffer.
			 */
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTAVAIL),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/* Cut the handed-over chain off from the buffer. */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			/* Nothing dequeued above and the copy failed. */
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			/* pru_rcvd() is called without the sockbuf lock. */
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	/* Every path reaching here holds both the sockbuf lock and sblock. */
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * Requests with mp0 != NULL, MSG_PEEK or MSG_OOB are handed to
 * soreceive_generic().  MSG_TRUNC is added to *flagsp when part of the
 * datagram did not fit into uio and was discarded.
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		/* A pending error is reported once and then cleared. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* With PR_ADDR the record starts with an MT_SONAME address mbuf. */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL). In some cases there can be only MT_CONTROL mbufs without
	 * MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Split the leading MT_CONTROL mbufs off into the list 'cm'. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/*
				 * NOTE(review): the value stored in 'error'
				 * here is not examined afterwards.
				 */
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			/* Advance controlp to the end of what was appended. */
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The rest of the datagram is dropped on failure. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Whatever did not fit is discarded and reported as truncation. */
	if (m != NULL) {
		flags |= MSG_TRUNC;
		m_freem(m);
	}
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}

/*
 * Receive entry point: runs the protocol's pru_soreceive method with the
 * socket's vnet set as the current one.  Listening sockets are rejected
 * with ENOTCONN without calling into the protocol.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int error;

	CURVNET_SET(so->so_vnet);
	if (!SOLISTENING(so))
		error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
		    mp0, controlp, flagsp));
	else
		error = ENOTCONN;
	CURVNET_RESTORE();
	return (error);
}

/*
 * Shut down the receive side, the send side, or both, according to 'how'
 * (SHUT_RD, SHUT_WR or SHUT_RDWR); any other value yields EINVAL.
 *
 * Returns ENOTCONN for a socket that is not connected, connecting or
 * disconnecting; for datagram and listening sockets the shutdown work is
 * still carried out first.  Otherwise returns the result of pru_shutdown()
 * when the send side is involved, or 0.
 */
int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error, soerror_enotconn;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	soerror_enotconn = 0;
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		/*
		 * POSIX mandates us to return ENOTCONN when shutdown(2) is
		 * invoked on a datagram sockets, however historically we would
		 * actually tear socket down. This is known to be leveraged by
		 * some applications to unblock process waiting in recvXXX(2)
		 * by other process that it shares that socket with. Try to meet
		 * both backward-compatibility and POSIX requirements by forcing
		 * ENOTCONN but still asking protocol to perform pru_shutdown().
		 */
		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
			return (ENOTCONN);
		soerror_enotconn = 1;
	}

	if (SOLISTENING(so)) {
		/* Only a read-side shutdown affects a listening socket. */
		if (how != SHUT_WR) {
			SOLISTEN_LOCK(so);
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		}
		goto done;
	}

	CURVNET_SET(so->so_vnet);
	if (pr->pr_usrreqs->pru_flush != NULL)
		(*pr->pr_usrreqs->pru_flush)(so, how);
	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD) {
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		wakeup(&so->so_timeo);
		CURVNET_RESTORE();
		/* A protocol error takes precedence over the forced ENOTCONN. */
		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
	}
	wakeup(&so->so_timeo);
	CURVNET_RESTORE();

done:
	return (soerror_enotconn ? ENOTCONN : 0);
}

/*
 * Flush the receive buffer of a socket: mark it as unable to receive more,
 * move its contents to a scratch copy, and release that copy after letting
 * the domain dispose of any rights carried in it.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct socket aso;

	VNET_SO_ASSERT(so);

	/*
	 * In order to avoid calling dom_dispose with the socket buffer mutex
	 * held, and in order to generally avoid holding the lock for a long
	 * time, we make a copy of the socket buffer and clear the original
	 * (except locks, state).  The new socket buffer copy won't have
	 * initialized locks so we can only call routines that won't use or
	 * assert those locks.
	 *
	 * Dislodge threads currently blocked in receive and wait to acquire
	 * a lock against other simultaneous readers before clearing the
	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptable waiting.
	 */
	socantrcvmore(so);
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);

	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	SOCKBUF_LOCK(sb);
	bzero(&aso, sizeof(aso));
	aso.so_pcb = so->so_pcb;
	/* Everything from sb_startzero to the end of the sockbuf is moved. */
	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	/*
	 * Dispose of special rights and flush the copied socket.  Don't call
	 * any unsafe routines (that rely on locks being initialized) on aso.
	 */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(&aso);
	sbrelease_internal(&aso.so_rcv, so);
}

/*
 * Wrapper for Socket established helper hook.
 * Parameters: socket, context of the hook point, hook id.
 *
 * Returns the 'status' field of the hook data after the hooks have run;
 * it is initialised to 0 here.
 *
 * NOTE(review): `static int inline' places the function specifier after the
 * type; `static inline int' is the conventional order.
 */
static int inline
hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
{
	struct socket_hhook_data hhook_data = {
		.so = so,
		.hctx = hctx,
		.m = NULL,
		.status = 0
	};

	CURVNET_SET(so->so_vnet);
	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
	CURVNET_RESTORE();

	/* Ugly but needed, since hhooks return void for now */
	return (hhook_data.status);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
 *
 * Copies the option value described by sopt into buf, which holds up to len
 * bytes.  Returns EINVAL if fewer than minlen bytes were supplied, the
 * copyin() result when sopt_td is set, and 0 otherwise.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	/* A non-NULL sopt_td selects copyin(); otherwise a plain bcopy(). */
	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return (0);
}

/*
 * Kernel version of setsockopt(2).
*
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	/* No thread: the sooptcopy*() helpers then use bcopy() on optval. */
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}

/*
 * Set a socket option.
 *
 * Options at a level other than SOL_SOCKET are passed to the protocol's
 * pr_ctloutput() (ENOPROTOOPT if the protocol has none).  SOL_SOCKET options
 * are handled here; when one is applied successfully the protocol's
 * pr_ctloutput() is called as well and its return value is ignored.
 * The socket's vnet is the current vnet for the duration of the call.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	sbintime_t val;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_setopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;
			/* Out-of-range linger times are rejected with EDOM. */
			if (l.l_linger < 0 ||
			    l.l_linger > USHRT_MAX ||
			    l.l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/*
		 * Boolean options: the option name is used directly as the
		 * bit to set or clear in so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_WANT_KTLS:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			if (optval < 0 || optval >= rt_numfibs) {
				error = EINVAL;
				goto bad;
			}
			/* Other protocol families always get FIB 0. */
			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
				so->so_fibnum = optval;
			else
				so->so_fibnum = 0;
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these options,
			 * so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			error = sbsetopt(so, sopt->sopt_name, optval);
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				/*
				 * NOTE(review): tv32 is converted into tv
				 * before 'error' is checked; on failure tv is
				 * not used because of the goto below.
				 */
				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;
			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
			    tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* Very large timeouts are clamped to SBT_MAX. */
			if (tv.tv_sec > INT32_MAX)
				val = SBT_MAX;
			else
				val = tvtosbt(tv);
			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			/*
			 * NOTE(review): dereferences sopt->sopt_td, which
			 * so_setsockopt() sets to NULL -- confirm in-kernel
			 * callers never pass SO_LABEL.
			 */
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_TS_CLOCK:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_ts_clock = optval;
			break;

		case SO_MAX_PACING_RATE:
			error = sooptcopyin(sopt, &val32, sizeof(val32),
			    sizeof(val32));
			if (error)
				goto bad;
			so->so_max_pacing_rate = val32;
			break;

		default:
			/* Unknown options go to the helper hooks, if any. */
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
				error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol see the option too; its result is ignored. */
		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}

/*
 * Helper routine for getsockopt.
 *
 * Copies at most min(len, sopt->sopt_valsize) bytes from buf to the option
 * buffer and records that count in sopt->sopt_valsize.  Nothing is copied
 * when sopt_val is NULL.  Returns the copyout() result when sopt_td is set,
 * and 0 otherwise.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}

/*
 * Get a socket option.
 *
 * Options at a level other than SOL_SOCKET are passed to the protocol's
 * pr_ctloutput() (ENOPROTOOPT if the protocol has none).  For SOL_SOCKET,
 * integer-valued options are gathered into 'optval' and copied out at the
 * shared 'integer' label.  Reading SO_ERROR also clears so_error.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_getopt(so, sopt);
			break;

		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options report the masked so_options bit. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_WANT_KTLS:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared exit for every int-valued option below. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_DOMAIN:
			optval = so->so_proto->pr_domain->dom_family;
			goto integer;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			SOCK_LOCK(so);
			optval = so->so_error;
			so->so_error = 0;
			SOCK_UNLOCK(so);
			goto integer;

		/*
		 * Listening sockets report buffer limits from their sol_*
		 * fields rather than from so_snd/so_rcv.
		 */
		case SO_SNDBUF:
			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
			    so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
			    so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
			    so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
			    so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			/*
			 * NOTE(review): dereferences sopt->sopt_td -- confirm
			 * it cannot be NULL on this path.
			 */
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			/*
			 * NOTE(review): dereferences sopt->sopt_td -- confirm
			 * it cannot be NULL on this path.
			 */
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		/* Listen queue statistics read as 0 on non-listening sockets. */
		case SO_LISTENQLIMIT:
			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
			goto integer;

		case SO_LISTENQLEN:
			optval = SOLISTENING(so) ? so->sol_qlen : 0;
			goto integer;

		case SO_LISTENINCQLEN:
			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
			goto integer;

		case SO_TS_CLOCK:
			optval = so->so_ts_clock;
			goto integer;

		case SO_MAX_PACING_RATE:
			optval = so->so_max_pacing_rate;
			goto integer;

		default:
			/* Unknown options go to the helper hooks, if any. */
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
				error = ENOPROTOOPT;
			break;
		}
	}
	/* The label is only referenced from the MAC-specific cases. */
#ifdef MAC
bad:
#endif
	CURVNET_RESTORE();
	return (error);
}

/*
 * Allocate an mbuf chain whose m_len fields add up to sopt->sopt_valsize
 * and return it in *mp.  Allocations use M_WAITOK when sopt_td is set and
 * M_NOWAIT otherwise.  Returns ENOBUFS if an allocation fails, after
 * freeing whatever had been allocated so far.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	/* More than a plain mbuf can hold: attach a cluster. */
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Keep appending mbufs until the whole size is covered. */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
			    M_NOWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/*
 * Fill the mbuf chain 'm' from the option buffer, advancing sopt_val and
 * reducing sopt_valsize by each mbuf's length.  Does nothing when sopt_val
 * is NULL.  On a copyin() failure the whole chain is freed and the error is
 * returned.  Panics if the chain holds more than sopt_valsize bytes.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return (0);
}

/*
 * Copy the contents of the mbuf chain 'm' out to the option buffer,
 * advancing sopt_val as it goes.  Does nothing when sopt_val is NULL.
 * On success sopt_valsize is set to the number of bytes copied.  If the
 * option buffer is too small for the chain, or copyout() fails, the chain
 * is freed and EINVAL or the copyout() error is returned.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

/*
 * sohasoutofband(): protocol notifies socket layer of the arrival of new
 * out-of-band data, which will then notify socket consumers.
 */
void
sohasoutofband(struct socket *so)
{

	/* SIGURG goes to the registered owner, if there is one. */
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rdsel, PSOCK);
}

/*
 * Poll entry point: forwards to the protocol's pru_sopoll method.
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{

	/*
	 * We do not need to set or assert curvnet as long as everyone uses
	 * sopoll_generic().
	 */
	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
	    td));
}

/*
 * Generic poll implementation.  Returns the subset of 'events' that is
 * currently satisfied, possibly with POLLHUP added.  When nothing is ready
 * the thread is recorded on the relevant selinfo so it can be woken later.
 */
int
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		/* A listening socket is readable once sol_comp is non-empty. */
		if (!(events & (POLLIN | POLLRDNORM)))
			revents = 0;
		else if (!TAILQ_EMPTY(&so->sol_comp))
			revents = events & (POLLIN | POLLRDNORM);
		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
		else {
			selrecord(td, &so->so_rdsel);
			revents = 0;
		}
	} else {
		revents = 0;
		/* Send buffer lock first, then receive buffer lock. */
		SOCKBUF_LOCK(&so->so_snd);
		SOCKBUF_LOCK(&so->so_rcv);
		if (events & (POLLIN | POLLRDNORM))
			if (soreadabledata(so))
				revents |= events & (POLLIN | POLLRDNORM);
		if (events & (POLLOUT | POLLWRNORM))
			if (sowriteable(so))
				revents |= events & (POLLOUT | POLLWRNORM);
		if (events & (POLLPRI | POLLRDBAND))
			if (so->so_oobmark ||
			    (so->so_rcv.sb_state & SBS_RCVATMARK))
				revents |= events & (POLLPRI | POLLRDBAND);
		if ((events & POLLINIGNEOF) == 0) {
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				revents |= events & (POLLIN | POLLRDNORM);
				/* Both directions closed: report hangup. */
				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
					revents |= POLLHUP;
			}
		}
		if (revents == 0) {
			if (events &
			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
				selrecord(td, &so->so_rdsel);
				so->so_rcv.sb_flags |= SB_SEL;
			}
			if (events & (POLLOUT | POLLWRNORM)) {
				selrecord(td, &so->so_wrsel);
				so->so_snd.sb_flags |= SB_SEL;
			}
		}
		SOCKBUF_UNLOCK(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_snd);
	}
	SOCK_UNLOCK(so);
	return (revents);
}

/*
 * Attach a knote to a socket.  Picks the filter operations, knote list and
 * socket buffer that match kn->kn_filter; unsupported filters yield EINVAL.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		knl = &so->so_rdsel.si_note;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		break;
	case EVFILT_EMPTY:
		kn->kn_fop = &soempty_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		knlist_add(knl, kn, 1);
	} else {
		/* Non-listening sockets also flag the buffer with SB_KNOTE. */
		SOCKBUF_LOCK(sb);
		knlist_add(knl, kn, 1);
		sb->sb_flags |= SB_KNOTE;
		SOCKBUF_UNLOCK(sb);
	}
	SOCK_UNLOCK(so);
	return (0);
}

/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
{

	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{

	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return EOPNOTSUPP;
}

int
pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
{

	return (EOPNOTSUPP);
}

/*
 * This isn't really a ``null'' operation, but it's the default one and
 * doesn't do anything
destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 
1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. 
The semantics
 * of these routines are such that connectionless protocols can call
 * soisconnected() and soisdisconnected() only, bypassing the in-progress
 * calls when setting up a ``connection'' takes no time.
 *
 * From the passive side, a socket is created with two queues of sockets:
 * so_incomp for connections in progress and so_comp for connections already
 * made and awaiting user acceptance.  As a protocol is preparing incoming
 * connections, it creates a socket structure queued on so_incomp by calling
 * sonewconn().  When the connection is established, soisconnected() is
 * called, and transfers the socket structure to so_comp, making it available
 * to accept().
 *
 * If a socket is closed with sockets on either so_incomp or so_comp, these
 * sockets are dropped.
 *
 * If higher-level protocols are implemented in the kernel, the wakeups done
 * here will sometimes cause software-interrupt process scheduling.
 */
/* Mark the socket as connecting; no wakeups are issued. */
void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}

/*
 * Mark the socket as connected.  A socket still on its listener's
 * incomplete queue is either moved to the complete queue or, when
 * SO_ACCEPTFILTER is set, handed to the accept filter; any other socket
 * has its sleepers, readers and writers woken up.
 */
void
soisconnected(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from incomplete queue to complete, we
		 * need to go through reverse order of locking.  We first do
		 * trylock, and if that doesn't succeed, we go the hard way
		 * leaving a reference and rechecking consistency after proper
		 * locking.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket went off the listen queue,
				 * should be lost race to close(2) of sol.
				 * The socket is about to soabort().
				 */
				SOCK_UNLOCK(so);
				sorele(head);
				return;
			}
			/* Not the last one, as so holds a ref. */
			refcount_release(&head->so_count);
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/* Move from the incomplete to the complete queue. */
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			/*
			 * Install the accept filter as the receive upcall and
			 * run it once right away; SO_ACCEPTFILTER is cleared
			 * so the retry through 'again' takes the branch above.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				soupcall_clear(so, SO_RCV);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto again;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

/*
 * Mark the socket as disconnecting.  For a non-listening socket both
 * directions are also marked as unable to transfer more data.
 */
void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		/*
		 * No matching SOCKBUF_UNLOCK() appears here; presumably the
		 * *_locked() routines release the lock -- TODO confirm.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

/*
 * Mark the socket as disconnected.  For a non-listening socket both
 * directions are marked as unable to transfer more data and everything
 * still queued in the send buffer is dropped.
 */
void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		/*
		 * No matching SOCKBUF_UNLOCK() appears here; presumably the
		 * *_locked() routines release the lock -- TODO confirm.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 *
 * Returns whatever malloc() returned, so the result is NULL when the
 * allocation failed.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}

/*
 * Register per-socket destructor.
*/ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_locked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } static void so_rdknl_assert_unlocked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else 
SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_locked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } static void so_wrknl_assert_unlocked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } Index: projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctls_impl.c =================================================================== --- projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctls_impl.c (revision 361061) +++ 
projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctls_impl.c (revision 361062) @@ -1,609 +1,656 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Modified from the kernel GSSAPI code for RPC-over-TLS. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rpctlscd.h" #include "rpctlssd.h" extern struct fileops badfileops; /* * Syscall hooks */ static struct syscall_helper_data rpctls_syscalls[] = { SYSCALL_INIT_HELPER(gssd_syscall), SYSCALL_INIT_LAST }; #ifdef notnow struct rpctls_syscall_args { char op_l_[PADL_(int)]; int op; char op_r_[PADR_(int)]; char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; char s_l_[PADL_(int)]; int s; char s_r_[PADR_(int)]; }; #endif static CLIENT *rpctls_connect_handle; static struct mtx rpctls_connect_lock; static struct socket *rpctls_connect_so = NULL; static CLIENT *rpctls_server_handle; static struct mtx rpctls_server_lock; static struct socket *rpctls_server_so = NULL; static struct opaque_auth rpctls_null_verf; static CLIENT *rpctls_connect_client(void); static CLIENT *rpctls_server_client(void); static enum clnt_stat rpctls_server(struct socket *so, uint32_t *flags, uint64_t *sslp, uid_t *uid, int *ngrps, gid_t **gids); int rpctls_init(void) { int error; error = syscall_helper_register(rpctls_syscalls, SY_THR_STATIC_KLD); if (error != 0) { printf("rpctls_init: cannot register syscall\n"); return (error); } mtx_init(&rpctls_connect_lock, "rpctls_connect_lock", NULL, MTX_DEF); mtx_init(&rpctls_server_lock, "rpctls_server_lock", NULL, MTX_DEF); rpctls_null_verf.oa_flavor = AUTH_NULL; rpctls_null_verf.oa_base = RPCTLS_START_STRING; rpctls_null_verf.oa_length = strlen(RPCTLS_START_STRING); return (0); } int sys_gssd_syscall(struct thread *td, struct gssd_syscall_args *uap) { struct sockaddr_un sun; struct netconfig *nconf; struct file *fp; struct socket *so; char path[MAXPATHLEN], *pathp; int fd = -1, error, retry_count = 5; CLIENT *cl, *oldcl; bool ssd; #ifdef KERN_TLS u_int 
maxlen; #endif printf("in gssd syscall\n"); error = priv_check(td, PRIV_NFS_DAEMON); printf("aft priv_check=%d\n", error); if (error != 0) return (error); #ifdef notyet switch (uap->op) { case RPCTLS_SYSC_SETPATH: #else error = copyinstr(uap->path, path, sizeof(path), NULL); printf("setting err=%d path=%s\n", error, path); if (error != 0) return (error); if (path[0] == 'S') { ssd = true; pathp = &path[1]; } else { ssd = false; pathp = &path[0]; } if (pathp[0] == '/' || pathp[0] == '\0') { #endif if (ssd) { if (error == 0 && strlen(pathp) + 1 > sizeof(sun.sun_path)) error = EINVAL; if (error == 0 && pathp[0] != '\0') { sun.sun_family = AF_LOCAL; strlcpy(sun.sun_path, pathp, sizeof(sun.sun_path)); sun.sun_len = SUN_LEN(&sun); nconf = getnetconfigent("local"); cl = clnt_reconnect_create(nconf, (struct sockaddr *)&sun, RPCTLSSD, RPCTLSSDVERS, RPC_MAXDATASIZE, RPC_MAXDATASIZE); printf("got cl=%p\n", cl); /* * The number of retries defaults to INT_MAX, which * effectively means an infinite, uninterruptable loop. * Limiting it to five retries keeps it from running * forever. */ if (cl != NULL) CLNT_CONTROL(cl, CLSET_RETRIES, &retry_count); } else cl = NULL; mtx_lock(&rpctls_server_lock); oldcl = rpctls_server_handle; rpctls_server_handle = cl; mtx_unlock(&rpctls_server_lock); printf("cl=%p oldcl=%p\n", cl, oldcl); if (oldcl != NULL) { CLNT_CLOSE(oldcl); CLNT_RELEASE(oldcl); } } else { if (error == 0 && strlen(pathp) + 1 > sizeof(sun.sun_path)) error = EINVAL; if (error == 0 && pathp[0] != '\0') { sun.sun_family = AF_LOCAL; strlcpy(sun.sun_path, pathp, sizeof(sun.sun_path)); sun.sun_len = SUN_LEN(&sun); nconf = getnetconfigent("local"); cl = clnt_reconnect_create(nconf, (struct sockaddr *)&sun, RPCTLSCD, RPCTLSCDVERS, RPC_MAXDATASIZE, RPC_MAXDATASIZE); printf("got cl=%p\n", cl); /* * The number of retries defaults to INT_MAX, which * effectively means an infinite, uninterruptable loop. * Limiting it to five retries keeps it from running * forever. 
*/ if (cl != NULL) CLNT_CONTROL(cl, CLSET_RETRIES, &retry_count); } else cl = NULL; mtx_lock(&rpctls_connect_lock); oldcl = rpctls_connect_handle; rpctls_connect_handle = cl; mtx_unlock(&rpctls_connect_lock); printf("cl=%p oldcl=%p\n", cl, oldcl); if (oldcl != NULL) { CLNT_CLOSE(oldcl); CLNT_RELEASE(oldcl); } } } else if (path[0] == 'C') { printf("In connect\n"); error = EINVAL; #ifdef KERN_TLS if (PMAP_HAS_DMAP != 0 && mb_use_ext_pgs && rpctls_getinfo(&maxlen)) error = 0; #endif if (error == 0) error = falloc(td, &fp, &fd, 0); if (error == 0) { printf("falloc=%d fd=%d\n", error, fd); mtx_lock(&rpctls_connect_lock); so = rpctls_connect_so; rpctls_connect_so = NULL; mtx_unlock(&rpctls_connect_lock); finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops); td->td_retval[0] = fd; } printf("returning=%d\n", fd); } else if (path[0] == 'E') { printf("In srvconnect\n"); error = EINVAL; #ifdef KERN_TLS if (PMAP_HAS_DMAP != 0 && mb_use_ext_pgs && rpctls_getinfo(&maxlen)) error = 0; #endif if (error == 0) error = falloc(td, &fp, &fd, 0); if (error == 0) { printf("srv falloc=%d fd=%d\n", error, fd); mtx_lock(&rpctls_server_lock); so = rpctls_server_so; rpctls_server_so = NULL; mtx_unlock(&rpctls_server_lock); finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops); td->td_retval[0] = fd; } printf("srv returning=%d\n", fd); } else if (path[0] == 'F') { printf("In EOserver\n"); fd = strtol(&path[1], NULL, 10); printf("srv fd=%d\n", fd); if (fd >= 0) { error = kern_close(td, fd); printf("srv aft kern_close=%d\n", error); } else { printf("rpctlss fd negative\n"); error = EINVAL; } } return (error); } /* * Acquire the rpctls_connect_handle and return it with a reference count, * if it is available. 
*/ static CLIENT * rpctls_connect_client(void) { CLIENT *cl; mtx_lock(&rpctls_connect_lock); cl = rpctls_connect_handle; if (cl != NULL) CLNT_ACQUIRE(cl); mtx_unlock(&rpctls_connect_lock); return (cl); } /* * Acquire the rpctls_server_handle and return it with a reference count, * if it is available. */ static CLIENT * rpctls_server_client(void) { CLIENT *cl; mtx_lock(&rpctls_server_lock); cl = rpctls_server_handle; if (cl != NULL) CLNT_ACQUIRE(cl); mtx_unlock(&rpctls_server_lock); return (cl); } /* Do an upcall for a new socket connect using TLS. */ enum clnt_stat rpctls_connect(CLIENT *newclient, struct socket *so, uint64_t *sslp) { struct rpctlscd_connect_res res; struct rpc_callextra ext; struct timeval utimeout; enum clnt_stat stat; CLIENT *cl; int val; static bool rpctls_connect_busy = false; printf("In rpctls_connect\n"); cl = rpctls_connect_client(); printf("connect_client=%p\n", cl); if (cl == NULL) return (RPC_AUTHERROR); /* First, do the AUTH_TLS NULL RPC. */ memset(&ext, 0, sizeof(ext)); utimeout.tv_sec = 30; utimeout.tv_usec = 0; ext.rc_auth = authtls_create(); printf("authtls=%p\n", ext.rc_auth); stat = clnt_call_private(newclient, &ext, NULLPROC, (xdrproc_t)xdr_void, NULL, (xdrproc_t)xdr_void, NULL, utimeout); printf("aft NULLRPC=%d\n", stat); AUTH_DESTROY(ext.rc_auth); if (stat == RPC_AUTHERROR) return (stat); if (stat != RPC_SUCCESS) return (RPC_SYSTEMERROR); /* Serialize the connect upcalls. */ mtx_lock(&rpctls_connect_lock); while (rpctls_connect_busy) msleep(&rpctls_connect_busy, &rpctls_connect_lock, PVFS, "rtlscn", 0); rpctls_connect_busy = true; rpctls_connect_so = so; mtx_unlock(&rpctls_connect_lock); printf("rpctls_conect so=%p\n", so); /* Temporarily block reception during the handshake upcall. */ val = 1; CLNT_CONTROL(newclient, CLSET_BLOCKRCV, &val); /* Do the connect handshake upcall. 
*/ stat = rpctlscd_connect_1(NULL, &res, cl); printf("aft connect upcall=%d\n", stat); if (stat == RPC_SUCCESS) { *sslp++ = res.sec; *sslp++ = res.usec; *sslp = res.ssl; } CLNT_RELEASE(cl); /* Unblock reception. */ val = 0; CLNT_CONTROL(newclient, CLSET_BLOCKRCV, &val); /* Once the upcall is done, the daemon is done with the fp and so. */ mtx_lock(&rpctls_connect_lock); rpctls_connect_so = NULL; rpctls_connect_busy = false; wakeup(&rpctls_connect_busy); mtx_unlock(&rpctls_connect_lock); printf("aft wakeup\n"); return (stat); } +/* Do an upcall to handle an non-application data record using TLS. */ +enum clnt_stat +rpctls_cl_handlerecord(uint64_t sec, uint64_t usec, uint64_t ssl) +{ + struct rpctlscd_handlerecord_arg arg; + enum clnt_stat stat; + CLIENT *cl; + +printf("In rpctls_cl_handlerecord\n"); + cl = rpctls_connect_client(); +printf("handlerecord_client=%p\n", cl); + if (cl == NULL) + return (RPC_FAILED); + + /* Do the handlerecord upcall. */ + arg.sec = sec; + arg.usec = usec; + arg.ssl = ssl; + stat = rpctlscd_handlerecord_1(&arg, NULL, cl); +printf("aft handlerecord upcall=%d\n", stat); + CLNT_RELEASE(cl); + return (stat); +} + +enum clnt_stat +rpctls_srv_handlerecord(uint64_t sec, uint64_t usec, uint64_t ssl) +{ + struct rpctlssd_handlerecord_arg arg; + enum clnt_stat stat; + CLIENT *cl; + +printf("In rpctls_srv_handlerecord\n"); + cl = rpctls_server_client(); +printf("srv handlerecord_client=%p\n", cl); + if (cl == NULL) + return (RPC_FAILED); + + /* Do the handlerecord upcall. */ + arg.sec = sec; + arg.usec = usec; + arg.ssl = ssl; + stat = rpctlssd_handlerecord_1(&arg, NULL, cl); +printf("aft srv handlerecord upcall=%d\n", stat); + CLNT_RELEASE(cl); + return (stat); +} + /* Do an upcall to shut down a socket using TLS. 
*/ enum clnt_stat rpctls_cl_disconnect(uint64_t sec, uint64_t usec, uint64_t ssl) { struct rpctlscd_disconnect_arg arg; enum clnt_stat stat; CLIENT *cl; printf("In rpctls_cl_disconnect\n"); cl = rpctls_connect_client(); printf("disconnect_client=%p\n", cl); if (cl == NULL) return (RPC_FAILED); /* Do the disconnect upcall. */ arg.sec = sec; arg.usec = usec; arg.ssl = ssl; stat = rpctlscd_disconnect_1(&arg, NULL, cl); printf("aft disconnect upcall=%d\n", stat); CLNT_RELEASE(cl); return (stat); } enum clnt_stat rpctls_srv_disconnect(uint64_t sec, uint64_t usec, uint64_t ssl) { struct rpctlssd_disconnect_arg arg; enum clnt_stat stat; CLIENT *cl; printf("In rpctls_srv_disconnect\n"); cl = rpctls_server_client(); printf("srv disconnect_client=%p\n", cl); if (cl == NULL) return (RPC_FAILED); /* Do the disconnect upcall. */ arg.sec = sec; arg.usec = usec; arg.ssl = ssl; stat = rpctlssd_disconnect_1(&arg, NULL, cl); printf("aft srv disconnect upcall=%d\n", stat); CLNT_RELEASE(cl); return (stat); } /* Do an upcall for a new server socket using TLS. */ static enum clnt_stat rpctls_server(struct socket *so, uint32_t *flags, uint64_t *sslp, uid_t *uid, int *ngrps, gid_t **gids) { enum clnt_stat stat; CLIENT *cl; struct rpctlssd_connect_res res; gid_t *gidp; uint32_t *gidv; int i; static bool rpctls_server_busy = false; printf("In rpctls_server\n"); cl = rpctls_server_client(); printf("server_client=%p\n", cl); if (cl == NULL) return (RPC_SYSTEMERROR); /* Serialize the server upcalls. */ mtx_lock(&rpctls_server_lock); while (rpctls_server_busy) msleep(&rpctls_server_busy, &rpctls_server_lock, PVFS, "rtlssn", 0); rpctls_server_busy = true; rpctls_server_so = so; mtx_unlock(&rpctls_server_lock); printf("rpctls_conect so=%p\n", so); /* Do the server upcall. 
*/ stat = rpctlssd_connect_1(NULL, &res, cl); if (stat == RPC_SUCCESS) { *flags = res.flags; *sslp++ = res.sec; *sslp++ = res.usec; *sslp = res.ssl; if ((*flags & (RPCTLS_FLAGS_CERTUSER | RPCTLS_FLAGS_DISABLED)) == RPCTLS_FLAGS_CERTUSER) { *ngrps = res.gid.gid_len; *uid = res.uid; *gids = gidp = mem_alloc(*ngrps * sizeof(gid_t)); gidv = res.gid.gid_val; printf("got uid=%d ngrps=%d gidv=%p gids=%p\n", *uid, *ngrps, gidv, gids); for (i = 0; i < *ngrps; i++) *gidp++ = *gidv++; } } printf("aft server upcall stat=%d flags=0x%x\n", stat, res.flags); CLNT_RELEASE(cl); /* Once the upcall is done, the daemon is done with the fp and so. */ mtx_lock(&rpctls_server_lock); rpctls_server_so = NULL; rpctls_server_busy = false; wakeup(&rpctls_server_busy); mtx_unlock(&rpctls_server_lock); printf("aft wakeup\n"); return (stat); } /* * Handle the NULL RPC with authentication flavor of AUTH_TLS. * This is a STARTTLS command, so do the upcall to the rpctlssd daemon, * which will do the TLS handshake. */ enum auth_stat _svcauth_rpcsec_tls(struct svc_req *rqst, struct rpc_msg *msg) { bool_t call_stat; enum clnt_stat stat; SVCXPRT *xprt; uint32_t flags; uint64_t ssl[3]; int ngrps; uid_t uid; gid_t *gidp; /* Initialize reply. */ rqst->rq_verf = rpctls_null_verf; printf("authtls: clen=%d vlen=%d fl=%d\n", rqst->rq_cred.oa_length, msg->rm_call.cb_verf.oa_length, msg->rm_call.cb_verf.oa_flavor); /* Check client credentials. */ if (rqst->rq_cred.oa_length != 0 || msg->rm_call.cb_verf.oa_length != 0 || msg->rm_call.cb_verf.oa_flavor != AUTH_NULL) return (AUTH_BADCRED); printf("authtls proc=%d\n", rqst->rq_proc); if (rqst->rq_proc != NULLPROC) return (AUTH_REJECTEDCRED); if (PMAP_HAS_DMAP == 0) return (AUTH_REJECTEDCRED); #ifndef KERN_TLS return (AUTH_REJECTEDCRED); #endif /* * Disable reception for the krpc so that the TLS handshake can * be done on the socket in the rpctlssd daemon. 
*/ xprt = rqst->rq_xprt; sx_xlock(&xprt->xp_lock); xprt->xp_dontrcv = TRUE; sx_xunlock(&xprt->xp_lock); /* * Send the reply to the NULL RPC with AUTH_TLS, which is the * STARTTLS command for Sun RPC. */ call_stat = svc_sendreply(rqst, (xdrproc_t)xdr_void, NULL); printf("authtls: null reply=%d\n", call_stat); if (!call_stat) { sx_xlock(&xprt->xp_lock); xprt->xp_dontrcv = FALSE; sx_xunlock(&xprt->xp_lock); xprt_active(xprt); /* Harmless if already active. */ return (AUTH_REJECTEDCRED); } /* Do an upcall to do the TLS handshake. */ stat = rpctls_server(rqst->rq_xprt->xp_socket, &flags, ssl, &uid, &ngrps, &gidp); /* Re-enable reception on the socket within the krpc. */ sx_xlock(&xprt->xp_lock); xprt->xp_dontrcv = FALSE; if (stat == RPC_SUCCESS) { xprt->xp_tls = flags; xprt->xp_sslsec = ssl[0]; xprt->xp_sslusec = ssl[1]; xprt->xp_sslrefno = ssl[2]; if ((flags & (RPCTLS_FLAGS_CERTUSER | RPCTLS_FLAGS_DISABLED)) == RPCTLS_FLAGS_CERTUSER) { xprt->xp_ngrps = ngrps; xprt->xp_uid = uid; xprt->xp_gidp = gidp; printf("got uid=%d ngrps=%d gidp=%p\n", uid, ngrps, gidp); } } sx_xunlock(&xprt->xp_lock); xprt_active(xprt); /* Harmless if already active. */ printf("authtls: aft handshake stat=%d\n", stat); return (RPCSEC_GSS_NODISPATCH); } /* * Get kern.ipc.tls.enable and kern.ipc.tls.maxlen. 
*/ bool rpctls_getinfo(u_int *maxlenp) { u_int maxlen; bool enable; int error; size_t siz; siz = sizeof(enable); error = kernel_sysctlbyname(curthread, "kern.ipc.tls.enable", &enable, &siz, NULL, 0, NULL, 0); if (error != 0) return (false); siz = sizeof(maxlen); error = kernel_sysctlbyname(curthread, "kern.ipc.tls.maxlen", &maxlen, &siz, NULL, 0, NULL, 0); if (error != 0) return (false); *maxlenp = maxlen; return (enable); } Index: projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlscd.x =================================================================== --- projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlscd.x (revision 361061) +++ projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlscd.x (revision 361062) @@ -1,53 +1,61 @@ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Modified from gssd.x for the client side of RPC-over-TLS. */ /* $FreeBSD:$ */ struct rpctlscd_connect_res { uint64_t sec; uint64_t usec; uint64_t ssl; }; +struct rpctlscd_handlerecord_arg { + uint64_t sec; + uint64_t usec; + uint64_t ssl; +}; + struct rpctlscd_disconnect_arg { uint64_t sec; uint64_t usec; uint64_t ssl; }; program RPCTLSCD { version RPCTLSCDVERS { void RPCTLSCD_NULL(void) = 0; rpctlscd_connect_res RPCTLSCD_CONNECT(void) = 1; - void RPCTLSCD_DISCONNECT(rpctlscd_disconnect_arg) = 2; + void RPCTLSCD_HANDLERECORD(rpctlscd_handlerecord_arg) = 2; + + void RPCTLSCD_DISCONNECT(rpctlscd_disconnect_arg) = 3; } = 1; } = 0x40677374; Index: projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlssd.x =================================================================== --- projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlssd.x (revision 361061) +++ projects/nfs-over-tls/sys/rpc/rpcsec_tls/rpctlssd.x (revision 361062) @@ -1,56 +1,64 @@ /*- * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Modified from gssd.x for the server side of RPC-over-TLS. 
*/ /* $FreeBSD$ */ struct rpctlssd_connect_res { uint32_t flags; uint64_t sec; uint64_t usec; uint64_t ssl; uint32_t uid; uint32_t gid<>; }; +struct rpctlssd_handlerecord_arg { + uint64_t sec; + uint64_t usec; + uint64_t ssl; +}; + struct rpctlssd_disconnect_arg { uint64_t sec; uint64_t usec; uint64_t ssl; }; program RPCTLSSD { version RPCTLSSDVERS { void RPCTLSSD_NULL(void) = 0; rpctlssd_connect_res RPCTLSSD_CONNECT(void) = 1; - void RPCTLSSD_DISCONNECT(rpctlssd_disconnect_arg) = 2; + void RPCTLSSD_HANDLERECORD(rpctlssd_handlerecord_arg) = 2; + + void RPCTLSSD_DISCONNECT(rpctlssd_disconnect_arg) = 3; } = 1; } = 0x40677375; Index: projects/nfs-over-tls/sys/rpc/rpcsec_tls.h =================================================================== --- projects/nfs-over-tls/sys/rpc/rpcsec_tls.h (revision 361061) +++ projects/nfs-over-tls/sys/rpc/rpcsec_tls.h (revision 361062) @@ -1,65 +1,69 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Rick Macklem * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _RPCTLS_IMPL_H #define _RPCTLS_IMPL_H /* Operation values for rpctls syscall. */ #define RPCTLS_SYSC_SETPATH 1 #define RPCTLS_SYSC_CONNECT 2 #define RPCTLS_SYSC_SERVER 3 /* Flag bits to indicate certificate results. */ #define RPCTLS_FLAGS_HANDSHAKE 0x01 #define RPCTLS_FLAGS_GOTCERT 0x02 #define RPCTLS_FLAGS_SELFSIGNED 0x04 #define RPCTLS_FLAGS_VERIFIED 0x08 #define RPCTLS_FLAGS_DISABLED 0x10 #define RPCTLS_FLAGS_CERTUSER 0x20 #ifdef _KERNEL /* Functions that perform upcalls to the rpctlsd daemon. */ enum clnt_stat rpctls_connect(CLIENT *newclient, struct socket *so, uint64_t *sslp); +enum clnt_stat rpctls_cl_handlerecord(uint64_t sec, uint64_t usec, + uint64_t ssl); +enum clnt_stat rpctls_srv_handlerecord(uint64_t sec, uint64_t usec, + uint64_t ssl); enum clnt_stat rpctls_cl_disconnect(uint64_t sec, uint64_t usec, uint64_t ssl); enum clnt_stat rpctls_srv_disconnect(uint64_t sec, uint64_t usec, uint64_t ssl); /* Initialization function for rpcsec_tls. */ int rpctls_init(void); /* Get TLS information function. */ bool rpctls_getinfo(u_int *maxlen); /* String for AUTH_TLS reply verifier. 
*/ #define RPCTLS_START_STRING "STARTTLS" #endif /* _KERNEL */ #endif /* _RPCTLS_IMPL_H */ Index: projects/nfs-over-tls/sys/rpc/svc_vc.c =================================================================== --- projects/nfs-over-tls/sys/rpc/svc_vc.c (revision 361061) +++ projects/nfs-over-tls/sys/rpc/svc_vc.c (revision 361062) @@ -1,1080 +1,1114 @@ /* $NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #if defined(LIBC_SCCS) && !defined(lint) static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro"; static char *sccsid = "@(#)svc_tcp.c 2.2 88/08/01 4.0 RPCSRC"; #endif #include __FBSDID("$FreeBSD$"); /* * svc_vc.c, Server side for Connection Oriented based RPC. * * Actually implements two flavors of transporter - * a tcp rendezvouser (a listner and connection establisher) * and a record/tcp stream. */ #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *); static void svc_vc_rendezvous_destroy(SVCXPRT *); static bool_t svc_vc_null(void); static void svc_vc_destroy(SVCXPRT *); static enum xprt_stat svc_vc_stat(SVCXPRT *); static bool_t svc_vc_ack(SVCXPRT *, uint32_t *); static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *seq); static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in); static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq, void *in); static void svc_vc_backchannel_destroy(SVCXPRT *); static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *); static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *); static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in); static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct 
socket *so, void *arg, int waitflag); static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, .xp_stat = svc_vc_rendezvous_stat, .xp_reply = (bool_t (*)(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null, .xp_destroy = svc_vc_rendezvous_destroy, .xp_control = svc_vc_rendezvous_control }; static struct xp_ops svc_vc_ops = { .xp_recv = svc_vc_recv, .xp_stat = svc_vc_stat, .xp_ack = svc_vc_ack, .xp_reply = svc_vc_reply, .xp_destroy = svc_vc_destroy, .xp_control = svc_vc_control }; static struct xp_ops svc_vc_backchannel_ops = { .xp_recv = svc_vc_backchannel_recv, .xp_stat = svc_vc_backchannel_stat, .xp_reply = svc_vc_backchannel_reply, .xp_destroy = svc_vc_backchannel_destroy, .xp_control = svc_vc_backchannel_control }; /* * Usage: * xprt = svc_vc_create(sock, send_buf_size, recv_buf_size); * * Creates, registers, and returns a (rpc) tcp based transporter. * Once *xprt is initialized, it is registered as a transporter * see (svc.h, xprt_register). This routine returns * a NULL if a problem occurred. * * The filedescriptor passed in is expected to refer to a bound, but * not yet connected socket. * * Since streams do buffered io similar to stdio, the caller can specify * how big the send and receive buffers are via the second and third parms; * 0 => use the system default. 
*/

/*
 * Create a transport for the socket "so" and attach it to "pool".
 *
 * If the socket is already connected (or has been disconnected), its peer
 * address is looked up and a connection transport is created through
 * svc_vc_create_conn().  Otherwise a rendezvous transport is allocated,
 * the socket is put into the listening state and
 * svc_vc_rendezvous_soupcall() is installed as its listen upcall.
 *
 * The sendsize and recvsize arguments are not used by this function.
 *
 * Returns the new transport, or NULL on failure.
 */
SVCXPRT *
svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize,
    size_t recvsize)
{
	SVCXPRT *xprt;
	struct sockaddr *sa;
	int error;

	SOCK_LOCK(so);
	if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) {
		SOCK_UNLOCK(so);
		CURVNET_SET(so->so_vnet);
		error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
		CURVNET_RESTORE();
		if (error)
			return (NULL);
		xprt = svc_vc_create_conn(pool, so, sa);
		free(sa, M_SONAME);
		return (xprt);
	}
	SOCK_UNLOCK(so);

	xprt = svc_xprt_alloc();
	sx_init(&xprt->xp_lock, "xprt->xp_lock");
	xprt->xp_pool = pool;
	xprt->xp_socket = so;
	xprt->xp_p1 = NULL;
	xprt->xp_p2 = NULL;
	xprt->xp_ops = &svc_vc_rendezvous_ops;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
	CURVNET_RESTORE();
	if (error) {
		goto cleanup_svc_vc_create;
	}

	memcpy(&xprt->xp_ltaddr, sa, sa->sa_len);
	free(sa, M_SONAME);

	/*
	 * Put the socket into the listening state before the transport is
	 * registered.  If that fails, the listen upcall must not be
	 * installed on the socket, so give up while the transport can still
	 * be torn down through the normal cleanup path.
	 */
	error = solisten(so, -1, curthread);
	if (error)
		goto cleanup_svc_vc_create;

	xprt_register(xprt);

	SOLISTEN_LOCK(so);
	xprt->xp_upcallset = 1;
	solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt);
	SOLISTEN_UNLOCK(so);

	return (xprt);

cleanup_svc_vc_create:
	sx_destroy(&xprt->xp_lock);
	svc_xprt_free(xprt);

	return (NULL);
}

/*
 * Create a new transport for a socket obtained via soaccept().
*/

/*
 * Build a connection transport for "so", whose peer address is "raddr".
 *
 * SO_KEEPALIVE is enabled on the socket, and TCP_NODELAY as well when the
 * protocol is TCP.  The transport is registered, svc_vc_soupcall() is
 * installed as the receive upcall and the transport is marked active.
 *
 * Returns the new transport, or NULL if a socket option could not be set
 * or the local address could not be obtained.
 */
SVCXPRT *
svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr)
{
	struct sockopt sopt;
	struct sockaddr *laddr = NULL;
	struct cf_conn *conn;
	SVCXPRT *xprt;
	int enable = 1;
	int error;

	/* Enable keepalives on the connection. */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = SOL_SOCKET;
	sopt.sopt_name = SO_KEEPALIVE;
	sopt.sopt_val = &enable;
	sopt.sopt_valsize = sizeof(enable);
	error = sosetopt(so, &sopt);
	if (error != 0)
		return (NULL);

	/* For TCP, also set TCP_NODELAY. */
	if (so->so_proto->pr_protocol == IPPROTO_TCP) {
		bzero(&sopt, sizeof(sopt));
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = IPPROTO_TCP;
		sopt.sopt_name = TCP_NODELAY;
		sopt.sopt_val = &enable;
		sopt.sopt_valsize = sizeof(enable);
		error = sosetopt(so, &sopt);
		if (error != 0)
			return (NULL);
	}

	/* Per-connection record parsing state. */
	conn = mem_alloc(sizeof(*conn));
	conn->strm_stat = XPRT_IDLE;

	xprt = svc_xprt_alloc();
	sx_init(&xprt->xp_lock, "xprt->xp_lock");
	xprt->xp_pool = pool;
	xprt->xp_socket = so;
	xprt->xp_p1 = conn;
	xprt->xp_p2 = NULL;
	xprt->xp_ops = &svc_vc_ops;

	/*
	 * See http://www.connectathon.org/talks96/nfstcp.pdf - client
	 * has a 5 minute timer, server has a 6 minute timer.
	 */
	xprt->xp_idletimeout = 6 * 60;

	memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len);

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &laddr);
	CURVNET_RESTORE();
	if (error != 0) {
		/* Undo the allocations made above. */
		sx_destroy(&xprt->xp_lock);
		svc_xprt_free(xprt);
		mem_free(conn, sizeof(*conn));
		return (NULL);
	}

	memcpy(&xprt->xp_ltaddr, laddr, laddr->sa_len);
	free(laddr, M_SONAME);

	xprt_register(xprt);

	SOCKBUF_LOCK(&so->so_rcv);
	xprt->xp_upcallset = 1;
	soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Throw the transport into the active list in case it already
	 * has some data buffered.
	 */
	sx_xlock(&xprt->xp_lock);
	xprt_active(xprt);
	sx_xunlock(&xprt->xp_lock);

	return (xprt);
}

/*
 * Create a new transport for a backchannel on a clnt_vc socket.
*/ SVCXPRT * svc_vc_create_backchannel(SVCPOOL *pool) { SVCXPRT *xprt = NULL; struct cf_conn *cd = NULL; cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = NULL; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_backchannel_ops; return (xprt); } /* * This does all of the accept except the final call to soaccept. The * caller will call soaccept after dropping its locks (soaccept may * call malloc). */ int svc_vc_accept(struct socket *head, struct socket **sop) { struct socket *so; int error = 0; short nbio; /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(curthread->td_ucred, head); if (error != 0) goto done; #endif /* * XXXGL: we want non-blocking semantics. The socket could be a * socket created by kernel as well as socket shared with userland, * so we can't be sure about presense of SS_NBIO. We also shall not * toggle it on the socket, since that may surprise userland. So we * set SS_NBIO only temporarily. */ SOLISTEN_LOCK(head); nbio = head->so_state & SS_NBIO; head->so_state |= SS_NBIO; error = solisten_dequeue(head, &so, 0); head->so_state &= (nbio & ~SS_NBIO); if (error) goto done; so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } /*ARGSUSED*/ static bool_t svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct socket *so = NULL; struct sockaddr *sa = NULL; int error; SVCXPRT *new_xprt; /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to accept a * connection from the socket and turn it into a new * transport. If the accept fails, we have drained all pending * connections so we call xprt_inactive(). 
*/ sx_xlock(&xprt->xp_lock); error = svc_vc_accept(xprt->xp_socket, &so); if (error == EWOULDBLOCK) { /* * We must re-test for new connections after taking * the lock to protect us in the case where a new * connection arrives after our call to accept fails * with EWOULDBLOCK. */ SOLISTEN_LOCK(xprt->xp_socket); if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); } sx_xunlock(&xprt->xp_lock); sa = NULL; error = soaccept(so, &sa); if (error) { /* * XXX not sure if I need to call sofree or soclose here. */ if (sa) free(sa, M_SONAME); return (FALSE); } /* * svc_vc_create_conn will call xprt_register - we don't need * to do anything with the new connection except derefence it. 
*/ new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa); if (!new_xprt) { soclose(so); } else { SVC_RELEASE(new_xprt); } free(sa, M_SONAME); return (FALSE); /* there is never an rpc msg to be processed */ } /*ARGSUSED*/ static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *xprt) { return (XPRT_IDLE); } static void svc_vc_destroy_common(SVCXPRT *xprt) { enum clnt_stat stat; if (xprt->xp_socket) { stat = RPC_FAILED; if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) stat = rpctls_srv_disconnect(xprt->xp_sslsec, xprt->xp_sslusec, xprt->xp_sslrefno); if (stat != RPC_SUCCESS) (void)soclose(xprt->xp_socket); } if (xprt->xp_netid) (void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1); svc_xprt_free(xprt); } static void svc_vc_rendezvous_destroy(SVCXPRT *xprt) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; solisten_upcall_set(xprt->xp_socket, NULL, NULL); } SOLISTEN_UNLOCK(xprt->xp_socket); svc_vc_destroy_common(xprt); } static void svc_vc_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); svc_vc_destroy_common(xprt); if (cd->mreq) m_freem(cd->mreq); if (cd->mpending) m_freem(cd->mpending); mem_free(cd, sizeof(*cd)); } static void svc_vc_backchannel_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; struct mbuf *m, *m2; svc_xprt_free(xprt); m = cd->mreq; while (m != NULL) { m2 = m; m = m->m_nextpkt; m_freem(m2); } mem_free(cd, sizeof(*cd)); } /*ARGSUSED*/ static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static enum xprt_stat svc_vc_stat(SVCXPRT *xprt) { struct cf_conn 
*cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->strm_stat == XPRT_DIED) return (XPRT_DIED); if (cd->mreq != NULL && cd->resid == 0 && cd->eor) return (XPRT_MOREREQS); if (soreadable(xprt->xp_socket)) return (XPRT_MOREREQS); return (XPRT_IDLE); } static bool_t svc_vc_ack(SVCXPRT *xprt, uint32_t *ack) { *ack = atomic_load_acq_32(&xprt->xp_snt_cnt); *ack -= sbused(&xprt->xp_socket->so_snd); return (TRUE); } static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->mreq != NULL) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * If we have an mbuf chain in cd->mpending, try to parse a record from it, * leaving the result in cd->mreq. If we don't have a complete record, leave * the partial result in cd->mreq and try to read more from the socket. */ static int svc_vc_process_pending(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct socket *so = xprt->xp_socket; struct mbuf *m; /* * If cd->resid is non-zero, we have part of the * record already, otherwise we are expecting a record * marker. */ if (!cd->resid && cd->mpending) { /* * See if there is enough data buffered to * make up a record marker. Make sure we can * handle the case where the record marker is * split across more than one mbuf. */ size_t n = 0; uint32_t header; m = cd->mpending; while (n < sizeof(uint32_t) && m) { n += m->m_len; m = m->m_next; } if (n < sizeof(uint32_t)) { so->so_rcv.sb_lowat = sizeof(uint32_t) - n; return (FALSE); } m_copydata(cd->mpending, 0, sizeof(header), (char *)&header); header = ntohl(header); cd->eor = (header & 0x80000000) != 0; cd->resid = header & 0x7fffffff; m_adj(cd->mpending, sizeof(uint32_t)); } /* * Start pulling off mbufs from cd->mpending * until we either have a complete record or * we run out of data. We use m_split to pull * data - it will pull as much as possible and * split the last mbuf if necessary. 
*/ while (cd->mpending && cd->resid) { m = cd->mpending; if (cd->mpending->m_next || cd->mpending->m_len > cd->resid) cd->mpending = m_split(cd->mpending, cd->resid, M_WAITOK); else cd->mpending = NULL; if (cd->mreq) m_last(cd->mreq)->m_next = m; else cd->mreq = m; while (m) { cd->resid -= m->m_len; m = m->m_next; } } /* * Block receive upcalls if we have more data pending, * otherwise report our need. */ if (cd->mpending) so->so_rcv.sb_lowat = INT_MAX; else so->so_rcv.sb_lowat = imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2)); return (TRUE); } static bool_t svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct uio uio; struct mbuf *m, *ctrl; struct socket* so = xprt->xp_socket; XDR xdrs; int error, rcvflag; uint32_t xid_plus_direction[2]; struct cmsghdr *cmsg; struct tls_get_record tgr; + enum clnt_stat ret; /* * Serialise access to the socket and our own record parsing * state. */ sx_xlock(&xprt->xp_lock); for (;;) { /* If we have no request ready, check pending queue. */ while (cd->mpending && (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) { if (!svc_vc_process_pending(xprt)) break; } /* Process and return complete request in cd->mreq. */ if (cd->mreq != NULL && cd->resid == 0 && cd->eor) { /* * Now, check for a backchannel reply. * The XID is in the first uint32_t of the reply * and the message direction is the second one. */ if ((cd->mreq->m_len >= sizeof(xid_plus_direction) || m_length(cd->mreq, NULL) >= sizeof(xid_plus_direction)) && xprt->xp_p2 != NULL) { m_copydata(cd->mreq, 0, sizeof(xid_plus_direction), (char *)xid_plus_direction); xid_plus_direction[0] = ntohl(xid_plus_direction[0]); xid_plus_direction[1] = ntohl(xid_plus_direction[1]); /* Check message direction. 
*/ if (xid_plus_direction[1] == REPLY) { clnt_bck_svccall(xprt->xp_p2, cd->mreq, xid_plus_direction[0]); cd->mreq = NULL; continue; } } xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE); cd->mreq = NULL; /* Check for next request in a pending queue. */ svc_vc_process_pending(xprt); if (cd->mreq == NULL || cd->resid != 0) { SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); } sx_xunlock(&xprt->xp_lock); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * If receiving is disabled so that a TLS handshake can be * done by the rpctlssd daemon, return FALSE here. */ + rcvflag = MSG_DONTWAIT; + if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) + rcvflag |= MSG_TLSAPPDATA; tryagain: if (xprt->xp_dontrcv) { sx_xunlock(&xprt->xp_lock); return (FALSE); } /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to * read as much as possible from the socket and put * the result in cd->mpending. If the read fails, * we have drained both cd->mpending and the socket so * we can call xprt_inactive(). */ uio.uio_resid = 1000000000; uio.uio_td = curthread; ctrl = m = NULL; - rcvflag = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &m, &ctrl, &rcvflag); if (error == EWOULDBLOCK) { /* * We must re-test for readability after * taking the lock to protect us in the case * where a new packet arrives on the socket * after our call to soreceive fails with * EWOULDBLOCK. */ SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); sx_xunlock(&xprt->xp_lock); return (FALSE); } + /* + * A return of ENXIO indicates that there is a + * non-application data record at the head of the + * socket's receive queue, for TLS connections. + * This record needs to be handled in userland + * via an SSL_read() call, so do an upcall to the daemon. 
+ */ + if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0 && + error == ENXIO) { + /* Disable reception. */ + xprt->xp_dontrcv = TRUE; + sx_xunlock(&xprt->xp_lock); +printf("Call rpctls_srv_handlerecord\n"); + ret = rpctls_srv_handlerecord(xprt->xp_sslsec, + xprt->xp_sslusec, xprt->xp_sslrefno); + sx_xlock(&xprt->xp_lock); + xprt->xp_dontrcv = FALSE; + if (ret != RPC_SUCCESS) { + /* + * All we can do is soreceive() it and + * then toss it. + */ + rcvflag = MSG_DONTWAIT; + goto tryagain; + } + sx_xunlock(&xprt->xp_lock); + xprt_active(xprt); /* Harmless if already active. */ + return (FALSE); + } + if (error) { SOCKBUF_LOCK(&so->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(so, SO_RCV); } SOCKBUF_UNLOCK(&so->so_rcv); xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (!m) { /* * EOF - the other end has closed the socket. */ xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } /* Process any record header(s). */ if (ctrl != NULL) { if (ctrl->m_next != NULL) printf("EEK! svc list of controls\n"); cmsg = mtod(ctrl, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* - * For now, just toss non-application - * data records. - * In the future, there may need to be - * an upcall done to the daemon. + * This should have been handled by + * the rpctls_svc_handlerecord() + * upcall. If not, all we can do is + * toss it away. 
*/ if (tgr.tls_type != TLS_RLTYPE_APP) { printf("Got weird type=%d\n", tgr.tls_type); m_freem(m); m_free(ctrl); + rcvflag = MSG_DONTWAIT | MSG_TLSAPPDATA; goto tryagain; } } m_free(ctrl); } if (cd->mpending) m_last(cd->mpending)->m_next = m; else cd->mpending = m; } } static bool_t svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct ct_data *ct; struct mbuf *m; XDR xdrs; sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct == NULL) { sx_xunlock(&xprt->xp_lock); return (FALSE); } mtx_lock(&ct->ct_lock); m = cd->mreq; if (m == NULL) { xprt_inactive_self(xprt); mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); return (FALSE); } cd->mreq = m->m_nextpkt; mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); printf("recv backch m=%p\n", m); xdrmbuf_create(&xdrs, m, XDR_DECODE); if (! xdr_callmsg(&xdrs, msg)) { printf("recv backch callmsg failed\n"); XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } static bool_t svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, len, maxextsiz; #ifdef KERN_TLS u_int maxlen; #endif /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); len = mrep->m_pkthdr.len; *mtod(mrep, uint32_t *) = htonl(0x80000000 | (len - sizeof(uint32_t))); /* For RPC-over-TLS, copy mrep to a chain of ext_pgs. 
*/ if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) { /* * Copy the mbuf chain to a chain of * ext_pgs mbuf(s) as required by KERN_TLS. */ maxextsiz = TLS_MAX_MSG_SIZE_V10_2; #ifdef KERN_TLS if (rpctls_getinfo(&maxlen)) maxextsiz = min(maxextsiz, maxlen); #endif mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz); } atomic_add_32(&xprt->xp_snd_cnt, len); /* * sosend consumes mreq. */ error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL, 0, curthread); if (!error) { atomic_add_rel_32(&xprt->xp_snt_cnt, len); if (seq) *seq = xprt->xp_snd_cnt; stat = TRUE; } else atomic_subtract_32(&xprt->xp_snd_cnt, len); } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { struct ct_data *ct; XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, maxextsiz; #ifdef KERN_TLS u_int maxlen; #endif /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); *mtod(mrep, uint32_t *) = htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t))); /* For RPC-over-TLS, copy mrep to a chain of ext_pgs. */ if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) { /* * Copy the mbuf chain to a chain of * ext_pgs mbuf(s) as required by KERN_TLS. 
*/ maxextsiz = TLS_MAX_MSG_SIZE_V10_2; #ifdef KERN_TLS if (rpctls_getinfo(&maxlen)) maxextsiz = min(maxextsiz, maxlen); #endif mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz); } sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct != NULL) error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL, 0, curthread); else error = EPIPE; sx_xunlock(&xprt->xp_lock); if (!error) { stat = TRUE; } } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_null() { return (FALSE); } static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) xprt_active(xprt); return (SU_OK); } static int svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv * and rpc.yppasswdd on AF_LOCAL. */ int __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) { int sock, ret; gid_t egid; uid_t euid; struct sockaddr *sa; sock = transp->xp_fd; sa = (struct sockaddr *)transp->xp_rtaddr; if (sa->sa_family == AF_LOCAL) { ret = getpeereid(sock, &euid, &egid); if (ret == 0) *uid = euid; return (ret); } else return (-1); } #endif Index: projects/nfs-over-tls/usr.sbin/rpctlscd/rpctlscd.c =================================================================== --- projects/nfs-over-tls/usr.sbin/rpctlscd/rpctlscd.c (revision 361061) +++ projects/nfs-over-tls/usr.sbin/rpctlscd/rpctlscd.c (revision 361062) @@ -1,710 +1,754 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Modified from gssd.c for the client side of kernel RPC-over-TLS. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rpctlscd.h" #ifndef _PATH_RPCTLSCDSOCK #define _PATH_RPCTLSCDSOCK "/var/run/rpctlscd.sock" #endif #ifndef _PATH_CERTANDKEY #define _PATH_CERTANDKEY "/etc/rpctlscd/" #endif #ifndef _PATH_RPCTLSCDPID #define _PATH_RPCTLSCDPID "/var/run/rpctlscd.pid" #endif #ifndef _PREFERRED_CIPHERS #define _PREFERRED_CIPHERS "AES128-GCM-SHA256" #endif static struct pidfh *rpctls_pfh = NULL; static int rpctls_debug_level; static bool rpctls_verbose; static int testnossl; static SSL_CTX *rpctls_ctx = NULL; static const char *rpctls_verify_cafile = NULL; static const char *rpctls_verify_capath = NULL; static const char *rpctls_crlfile = NULL; static const char *rpctls_certdir = _PATH_CERTANDKEY; static uint64_t rpctls_ssl_refno = 0; static uint64_t rpctls_ssl_sec = 0; static uint64_t rpctls_ssl_usec = 0; static bool rpctls_gothup = false; /* * A linked list of all current "SSL *"s and socket "fd"s * for kernel RPC TLS connections is maintained. * The "refno" field is a unique 64bit value used to * identify which entry a kernel RPC upcall refers to. 
*/ LIST_HEAD(ssl_list, ssl_entry); struct ssl_entry { LIST_ENTRY(ssl_entry) next; uint64_t refno; int s; SSL *ssl; }; static struct ssl_list rpctls_ssllist; static void rpctlscd_terminate(int); static SSL_CTX *rpctls_setupcl_ssl(bool cert); static SSL *rpctls_connect(SSL_CTX *ctx, int s); static int rpctls_gethost(int s, struct sockaddr *sad, char *hostip, size_t hostlen); static int rpctls_checkhost(struct sockaddr *sad, X509 *cert); static int rpctls_loadcrlfile(SSL_CTX *ctx); static void rpctls_huphandler(int sig __unused); extern void rpctlscd_1(struct svc_req *rqstp, SVCXPRT *transp); extern int gssd_syscall(const char *path); int main(int argc, char **argv) { /* * We provide an RPC service on a local-domain socket. The * kernel rpctls code will upcall to this daemon to do the initial * TLS handshake. */ struct sockaddr_un sun; int fd, oldmask, ch; SVCXPRT *xprt; bool cert; struct timeval tm; struct timezone tz; pid_t otherpid; /* Check that another rpctlscd isn't already running. */ rpctls_pfh = pidfile_open(_PATH_RPCTLSCDPID, 0600, &otherpid); if (rpctls_pfh == NULL) { if (errno == EEXIST) errx(1, "rpctlscd already running, pid: %d.", otherpid); warn("cannot open or create pidfile"); } if (modfind("ktls_ocf") < 0) { /* Not present in kernel, try loading it */ if (kldload("ktls_ocf") < 0 || modfind("ktls_ocf") < 0) errx(1, "Cannot load ktls_ocf"); } /* Get the time when this daemon is started. 
*/ gettimeofday(&tm, &tz); rpctls_ssl_sec = tm.tv_sec; rpctls_ssl_usec = tm.tv_usec; rpctls_verbose = false; testnossl = 0; cert = false; while ((ch = getopt(argc, argv, "D:dl:mp:r:tv")) != -1) { switch (ch) { case 'D': rpctls_certdir = optarg; break; case 'd': rpctls_debug_level++; break; case 'l': rpctls_verify_cafile = optarg; break; case 'm': cert = true; break; case 'p': rpctls_verify_capath = optarg; break; case 'r': rpctls_crlfile = optarg; break; case 't': testnossl = 1; break; case 'v': rpctls_verbose = true; break; default: fprintf(stderr, "usage: %s " "[-D certdir] [-d] " "[-l CAfile] [-m] " "[-p CApath] [-r CRLfile] " "[-v]\n", argv[0]); exit(1); break; } } if (rpctls_crlfile != NULL && rpctls_verify_cafile == NULL && rpctls_verify_capath == NULL) errx(1, "-r requires the -l and/or " "-p options"); if (modfind("krpc") < 0) { /* Not present in kernel, try loading it */ if (kldload("krpc") < 0 || modfind("krpc") < 0) errx(1, "Kernel RPC is not available"); } /* * Set up the SSL_CTX *. * Do it now, before daemonizing, in case the private key * is encrypted and requires a passphrase to be entered. 
*/ rpctls_ctx = rpctls_setupcl_ssl(cert); if (rpctls_ctx == NULL) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't set up TSL context"); exit(1); } err(1, "Can't set up TSL context"); } LIST_INIT(&rpctls_ssllist); if (!rpctls_debug_level) { if (daemon(0, 0) != 0) err(1, "Can't daemonize"); signal(SIGINT, SIG_IGN); signal(SIGQUIT, SIG_IGN); signal(SIGHUP, SIG_IGN); } signal(SIGTERM, rpctlscd_terminate); signal(SIGPIPE, rpctlscd_terminate); signal(SIGHUP, rpctls_huphandler); pidfile_write(rpctls_pfh); memset(&sun, 0, sizeof sun); sun.sun_family = AF_LOCAL; unlink(_PATH_RPCTLSCDSOCK); strcpy(sun.sun_path, _PATH_RPCTLSCDSOCK); sun.sun_len = SUN_LEN(&sun); fd = socket(AF_LOCAL, SOCK_STREAM, 0); if (fd < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't create local rpctlscd socket"); exit(1); } err(1, "Can't create local rpctlscd socket"); } oldmask = umask(S_IXUSR|S_IRWXG|S_IRWXO); if (bind(fd, (struct sockaddr *)&sun, sun.sun_len) < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't bind local rpctlscd socket"); exit(1); } err(1, "Can't bind local rpctlscd socket"); } umask(oldmask); if (listen(fd, SOMAXCONN) < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't listen on local rpctlscd socket"); exit(1); } err(1, "Can't listen on local rpctlscd socket"); } xprt = svc_vc_create(fd, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (!xprt) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't create transport for local rpctlscd socket"); exit(1); } err(1, "Can't create transport for local rpctlscd socket"); } if (!svc_reg(xprt, RPCTLSCD, RPCTLSCDVERS, rpctlscd_1, NULL)) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't register service for local rpctlscd socket"); exit(1); } err(1, "Can't register service for local rpctlscd socket"); } gssd_syscall(_PATH_RPCTLSCDSOCK); svc_run(); gssd_syscall(""); SSL_CTX_free(rpctls_ctx); EVP_cleanup(); return (0); } static void rpctlscd_verbose_out(const char *fmt, ...) 
{ va_list ap; if (rpctls_verbose) { va_start(ap, fmt); if (rpctls_debug_level == 0) vsyslog(LOG_INFO | LOG_DAEMON, fmt, ap); else vfprintf(stderr, fmt, ap); va_end(ap); } } bool_t rpctlscd_null_1_svc(void *argp, void *result, struct svc_req *rqstp) { rpctlscd_verbose_out("rpctlscd_null: done\n"); return (TRUE); } bool_t rpctlscd_connect_1_svc(void *argp, struct rpctlscd_connect_res *result, struct svc_req *rqstp) { int s; bool_t res; SSL *ssl; char buf[1024]; ssize_t siz, ret; struct ssl_entry *newslp; rpctlscd_verbose_out("rpctlsd_connect: started\n"); /* Get the socket fd from the kernel. */ s = gssd_syscall("C"); rpctlscd_verbose_out("rpctlsd_connect s=%d\n", s); if (s < 0) return (FALSE); /* Do a TLS connect handshake. */ ssl = rpctls_connect(rpctls_ctx, s); if (ssl == NULL) rpctlscd_verbose_out("rpctlsd_connect: can't do TLS " "handshake\n"); else { result->sec = rpctls_ssl_sec; result->usec = rpctls_ssl_usec; result->ssl = ++rpctls_ssl_refno; /* Hard to believe this will ever wrap around.. */ if (rpctls_ssl_refno == 0) result->ssl = ++rpctls_ssl_refno; } if (testnossl != 0 && ssl != NULL) { /* Read the 478 bytes of junk off the socket. */ siz = 478; ret = 1; while (siz > 0 && ret > 0) { ret = recv(s, &buf[478 - siz], siz, 0); siz -= ret; } } if (ssl == NULL) { /* * For RPC-over-TLS, this upcall is expected * to close off the socket. 
*/ shutdown(s, SHUT_WR); close(s); return (FALSE); } /* Maintain list of all current SSL *'s */ newslp = malloc(sizeof(*newslp)); newslp->refno = rpctls_ssl_refno; newslp->s = s; newslp->ssl = ssl; LIST_INSERT_HEAD(&rpctls_ssllist, newslp, next); return (TRUE); } bool_t +rpctlscd_handlerecord_1_svc(struct rpctlscd_handlerecord_arg *argp, + void *result, struct svc_req *rqstp) +{ + struct ssl_entry *slp; + int ret; + char junk; + + slp = NULL; + if (argp->sec == rpctls_ssl_sec && argp->usec == + rpctls_ssl_usec) { + LIST_FOREACH(slp, &rpctls_ssllist, next) { + if (slp->refno == argp->ssl) + break; + } + } + + if (slp != NULL) { + rpctlscd_verbose_out("rpctlscd_handlerecord fd=%d\n", + slp->s); + /* + * An SSL_read() of 0 bytes should fail, but it should + * handle the non-application data record before doing so. + */ + ret = SSL_read(slp->ssl, &junk, 0); + if (ret > 0) { + if (rpctls_debug_level == 0) + syslog(LOG_ERR, "SSL_read returned %d", ret); + else + fprintf(stderr, "SSL_read returned %d\n", ret); + } + } else + return (FALSE); + return (TRUE); +} + +bool_t rpctlscd_disconnect_1_svc(struct rpctlscd_disconnect_arg *argp, void *result, struct svc_req *rqstp) { struct ssl_entry *slp; + int ret; slp = NULL; if (argp->sec == rpctls_ssl_sec && argp->usec == rpctls_ssl_usec) { LIST_FOREACH(slp, &rpctls_ssllist, next) { if (slp->refno == argp->ssl) break; } } if (slp != NULL) { rpctlscd_verbose_out("rpctlscd_disconnect: fd=%d closed\n", slp->s); LIST_REMOVE(slp, next); + SSL_shutdown(slp->ssl); + /* Check to see if the peer has sent a close alert. */ + ret = SSL_get_shutdown(slp->ssl); +rpctlscd_verbose_out("get_shutdown=%d\n", ret); + if ((ret & (SSL_SENT_SHUTDOWN | SSL_RECEIVED_SHUTDOWN)) == + SSL_SENT_SHUTDOWN) + SSL_shutdown(slp->ssl); SSL_free(slp->ssl); /* * For RPC-over-TLS, this upcall is expected * to close off the socket. 
*/ shutdown(slp->s, SHUT_WR); close(slp->s); free(slp); } else return (FALSE); return (TRUE); } int rpctlscd_1_freeresult(SVCXPRT *transp, xdrproc_t xdr_result, caddr_t result) { return (TRUE); } static void rpctlscd_terminate(int sig __unused) { gssd_syscall(""); pidfile_remove(rpctls_pfh); exit(0); } static SSL_CTX * rpctls_setupcl_ssl(bool cert) { SSL_CTX *ctx; long flags; char path[PATH_MAX]; size_t len, rlen; int ret; SSL_library_init(); SSL_load_error_strings(); OpenSSL_add_all_algorithms(); ctx = SSL_CTX_new(TLS_client_method()); if (ctx == NULL) { rpctlscd_verbose_out("rpctls_setupcl_ssl: SSL_CTX_new " "failed\n"); return (NULL); } SSL_CTX_set_ecdh_auto(ctx, 1); /* * Set preferred ciphers, since KERN_TLS only supports a * few of them. */ ret = SSL_CTX_set_cipher_list(ctx, _PREFERRED_CIPHERS); if (ret == 0) { rpctlscd_verbose_out("rpctls_setupcl_ssl: " "SSL_CTX_set_cipher_list failed to set any ciphers\n"); SSL_CTX_free(ctx); return (NULL); } /* * If cert is true, a certificate and key exists in * rpctls_certdir, so that it can do mutual authentication. */ if (cert) { /* Get the cert.pem and key.pem files. 
*/ len = strlcpy(path, rpctls_certdir, sizeof(path)); rlen = sizeof(path) - len; if (strlcpy(&path[len], "cert.pem", rlen) != 8) { SSL_CTX_free(ctx); return (NULL); } ret = SSL_CTX_use_certificate_file(ctx, path, SSL_FILETYPE_PEM); if (ret != 1) { rpctlscd_verbose_out("rpctls_setupcl_ssl: can't use " "certificate file path=%s ret=%d\n", path, ret); SSL_CTX_free(ctx); return (NULL); } if (strlcpy(&path[len], "key.pem", rlen) != 7) { SSL_CTX_free(ctx); return (NULL); } ret = SSL_CTX_use_PrivateKey_file(ctx, path, SSL_FILETYPE_PEM); if (ret != 1) { rpctlscd_verbose_out("rpctls_setupcl_ssl: Can't use " "private key path=%s ret=%d\n", path, ret); SSL_CTX_free(ctx); return (NULL); } } if (rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL) { if (rpctls_crlfile != NULL) { ret = rpctls_loadcrlfile(ctx); if (ret == 0) { rpctlscd_verbose_out("rpctls_setupcl_ssl: " "Load CRLfile failed\n"); SSL_CTX_free(ctx); return (NULL); } } ret = SSL_CTX_load_verify_locations(ctx, rpctls_verify_cafile, rpctls_verify_capath); if (ret != 1) { rpctlscd_verbose_out("rpctls_setupcl_ssl: " "Can't load verify locations\n"); SSL_CTX_free(ctx); return (NULL); } /* * The man page says that the * SSL_CTX_set0_CA_list() call is not normally * needed, but I believe it is harmless. */ if (rpctls_verify_cafile != NULL) SSL_CTX_set0_CA_list(ctx, SSL_load_client_CA_file(rpctls_verify_cafile)); } /* RPC-over-TLS must use TLSv1.3. 
*/ #ifdef notyet flags = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1 | SSL_OP_NO_TLSv1_2; #else flags = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1_3; #endif SSL_CTX_set_options(ctx, flags); return (ctx); } static SSL * rpctls_connect(SSL_CTX *ctx, int s) { SSL *ssl; X509 *cert; struct sockaddr *sad; struct sockaddr_storage ad; char hostnam[NI_MAXHOST]; int gethostret, ret; char *cp, *cp2; if (rpctls_gothup) { rpctls_gothup = false; ret = rpctls_loadcrlfile(ctx); if (ret == 0) rpctlscd_verbose_out("rpctls_connect: Can't " "reload CRLfile\n"); } ssl = SSL_new(ctx); if (ssl == NULL) { rpctlscd_verbose_out("rpctls_connect: " "SSL_new failed\n"); return (NULL); } if (SSL_set_fd(ssl, s) != 1) { rpctlscd_verbose_out("rpctls_connect: " "SSL_set_fd failed\n"); SSL_free(ssl); return (NULL); } ret = SSL_connect(ssl); if (ret != 1) { rpctlscd_verbose_out("rpctls_connect: " "SSL_connect failed %d\n", ret); SSL_free(ssl); return (NULL); } cert = SSL_get_peer_certificate(ssl); if (cert == NULL) { rpctlscd_verbose_out("rpctls_connect: get peer" " certificate failed\n"); SSL_free(ssl); return (NULL); } gethostret = rpctls_gethost(s, sad, hostnam, sizeof(hostnam)); if (gethostret == 0) hostnam[0] = '\0'; ret = SSL_get_verify_result(ssl); if (ret == X509_V_OK && (rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL) && (gethostret == 0 || rpctls_checkhost(sad, cert) != 1)) ret = X509_V_ERR_HOSTNAME_MISMATCH; X509_free(cert); if (ret != X509_V_OK && (rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL)) { if (ret != X509_V_OK) { cp = X509_NAME_oneline(X509_get_issuer_name(cert), NULL, 0); cp2 = X509_NAME_oneline(X509_get_subject_name(cert), NULL, 0); if (rpctls_debug_level == 0) syslog(LOG_INFO | LOG_DAEMON, "rpctls_connect: client IP %s " "issuerName=%s subjectName=%s verify " "failed %s\n", hostnam, cp, cp2, X509_verify_cert_error_string(ret)); else fprintf(stderr, "rpctls_connect: client IP %s " "issuerName=%s subjectName=%s 
verify " "failed %s\n", hostnam, cp, cp2, X509_verify_cert_error_string(ret)); } SSL_free(ssl); return (NULL); } /* Check to see if ktls is enabled on the connection. */ ret = BIO_get_ktls_send(SSL_get_wbio(ssl)); rpctlscd_verbose_out("rpctls_connect: BIO_get_ktls_send=%d\n", ret); if (ret != 0) { ret = BIO_get_ktls_recv(SSL_get_rbio(ssl)); rpctlscd_verbose_out("rpctls_connect: BIO_get_ktls_recv=%d\n", ret); } if (ret == 0) { if (rpctls_debug_level == 0) syslog(LOG_ERR, "ktls not working\n"); else fprintf(stderr, "ktls not working\n"); SSL_free(ssl); return (NULL); } return (ssl); } /* * Get the server's IP address. */ static int rpctls_gethost(int s, struct sockaddr *sad, char *hostip, size_t hostlen) { socklen_t slen; int ret; slen = sizeof(struct sockaddr_storage); if (getpeername(s, sad, &slen) < 0) return (0); ret = 0; if (getnameinfo((const struct sockaddr *)sad, sad->sa_len, hostip, hostlen, NULL, 0, NI_NUMERICHOST) == 0) { rpctlscd_verbose_out("rpctls_gethost: %s\n", hostip); ret = 1; } return (ret); } /* * Check a server IP address against any host address in the * certificate. Basically getnameinfo(3) and * X509_check_host(). */ static int rpctls_checkhost(struct sockaddr *sad, X509 *cert) { char hostnam[NI_MAXHOST]; int ret; if (getnameinfo((const struct sockaddr *)sad, sad->sa_len, hostnam, sizeof(hostnam), NULL, 0, NI_NAMEREQD) != 0) return (0); rpctlscd_verbose_out("rpctls_checkhost: DNS %s\n", hostnam); ret = X509_check_host(cert, hostnam, strlen(hostnam), X509_CHECK_FLAG_NO_WILDCARDS, NULL); return (ret); } /* * (re)load the CRLfile into the certificate verification store. 
*/ static int rpctls_loadcrlfile(SSL_CTX *ctx) { X509_STORE *certstore; X509_LOOKUP *certlookup; int ret; if ((rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL) && rpctls_crlfile != NULL) { certstore = SSL_CTX_get_cert_store(ctx); certlookup = X509_STORE_add_lookup( certstore, X509_LOOKUP_file()); ret = 0; if (certlookup != NULL) ret = X509_load_crl_file(certlookup, rpctls_crlfile, X509_FILETYPE_PEM); if (ret != 0) ret = X509_STORE_set_flags(certstore, X509_V_FLAG_CRL_CHECK | X509_V_FLAG_CRL_CHECK_ALL); if (ret == 0) { rpctlscd_verbose_out( "rpctls_loadcrlfile: Can't" " load CRLfile=%s\n", rpctls_crlfile); return (ret); } } return (1); } static void rpctls_huphandler(int sig __unused) { rpctls_gothup = true; } Index: projects/nfs-over-tls/usr.sbin/rpctlssd/rpctlssd.c =================================================================== --- projects/nfs-over-tls/usr.sbin/rpctlssd/rpctlssd.c (revision 361061) +++ projects/nfs-over-tls/usr.sbin/rpctlssd/rpctlssd.c (revision 361062) @@ -1,898 +1,942 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ * Authors: Doug Rabson * Developed with Red Inc: Alfred Perlstein * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Modified from gssd.c for the server side of kernel RPC-over-TLS. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rpctlssd.h" #ifndef _PATH_RPCTLSSDSOCK #define _PATH_RPCTLSSDSOCK "/var/run/rpctlssd.sock" #define _PATH_RPCTLSSDS "S/var/run/rpctlssd.sock" #endif #ifndef _PATH_CERTANDKEY #define _PATH_CERTANDKEY "/etc/rpctlssd/" #endif #ifndef _PATH_RPCTLSSDPID #define _PATH_RPCTLSSDPID "/var/run/rpctlssd.pid" #endif #ifndef _PREFERRED_CIPHERS #define _PREFERRED_CIPHERS "AES128-GCM-SHA256" #endif static struct pidfh *rpctls_pfh = NULL; static int rpctls_debug_level; static bool rpctls_verbose; static SSL_CTX *rpctls_ctx = NULL; static bool rpctls_do_mutual = false; static const char *rpctls_verify_cafile = NULL; static const char *rpctls_verify_capath = NULL; static const char *rpctls_crlfile = NULL; static const char *rpctls_certdir = _PATH_CERTANDKEY; static bool rpctls_comparehost = false; static unsigned int rpctls_wildcard = X509_CHECK_FLAG_NO_WILDCARDS; static uint64_t rpctls_ssl_refno = 0; static uint64_t rpctls_ssl_sec = 0; static uint64_t rpctls_ssl_usec = 0; static bool rpctls_gothup = false; static bool rpctls_cnuser = false; static char *rpctls_dnsname; static const char *rpctls_cnuseroid = "1.2.3.4.6.9"; 
/* * A linked list of all current "SSL *"s and socket "fd"s * for kernel RPC TLS connections is maintained. * The "refno" field is a unique 64bit value used to * identify which entry a kernel RPC upcall refers to. */ LIST_HEAD(ssl_list, ssl_entry); struct ssl_entry { LIST_ENTRY(ssl_entry) next; uint64_t refno; int s; SSL *ssl; }; static struct ssl_list rpctls_ssllist; static void rpctlssd_terminate(int); static SSL_CTX *rpctls_setup_ssl(const char *certdir); static SSL *rpctls_server(SSL_CTX *ctx, int s, uint32_t *flags, uint32_t *uidp, int *ngrps, uint32_t *gidp); static int rpctls_gethost(int s, struct sockaddr *sad, char *hostip, size_t hostlen); static int rpctls_checkhost(struct sockaddr *sad, X509 *cert); static int rpctls_loadcrlfile(SSL_CTX *ctx); static int rpctls_cnname(X509 *cert, uint32_t *uidp, int *ngrps, uint32_t *gidp); static char *rpctls_getdnsname(char *dnsname); static void rpctls_huphandler(int sig __unused); extern void rpctlssd_1(struct svc_req *rqstp, SVCXPRT *transp); extern int gssd_syscall(const char *path); int main(int argc, char **argv) { /* * We provide an RPC service on a local-domain socket. The * kernel rpctls code will upcall to this daemon to do the initial * TLS handshake. */ struct sockaddr_un sun; int fd, oldmask, ch, debug; SVCXPRT *xprt; struct timeval tm; struct timezone tz; char hostname[MAXHOSTNAMELEN + 2]; pid_t otherpid; /* Check that another rpctlssd isn't already running. */ rpctls_pfh = pidfile_open(_PATH_RPCTLSSDPID, 0600, &otherpid); if (rpctls_pfh == NULL) { if (errno == EEXIST) errx(1, "rpctlssd already running, pid: %d.", otherpid); warn("cannot open or create pidfile"); } if (modfind("ktls_ocf") < 0) { /* Not present in kernel, try loading it */ if (kldload("ktls_ocf") < 0 || modfind("ktls_ocf") < 0) errx(1, "Cannot load ktls_ocf"); } /* Get the time when this daemon is started. */ gettimeofday(&tm, &tz); rpctls_ssl_sec = tm.tv_sec; rpctls_ssl_usec = tm.tv_usec; /* Set the dns name for the server. 
*/ rpctls_dnsname = rpctls_getdnsname(hostname); if (rpctls_dnsname == NULL) { strcpy(hostname, "@default.domain"); rpctls_dnsname = hostname; } fprintf(stderr, "dnsname=%s\n", rpctls_dnsname); debug = 0; rpctls_verbose = false; while ((ch = getopt(argc, argv, "D:dhl:n:mp:r:uvWw")) != -1) { switch (ch) { case 'D': rpctls_certdir = optarg; break; case 'd': rpctls_debug_level++; break; case 'h': rpctls_comparehost = true; break; case 'l': rpctls_verify_cafile = optarg; break; case 'm': rpctls_do_mutual = true; break; case 'n': hostname[0] = '@'; strlcpy(&hostname[1], optarg, MAXHOSTNAMELEN + 1); rpctls_dnsname = hostname; break; case 'p': rpctls_verify_capath = optarg; break; case 'r': rpctls_crlfile = optarg; break; case 'u': rpctls_cnuser = true; break; case 'v': rpctls_verbose = true; break; case 'W': if (rpctls_wildcard != X509_CHECK_FLAG_NO_WILDCARDS) errx(1, "options -w and -W are mutually " "exclusive"); rpctls_wildcard = X509_CHECK_FLAG_MULTI_LABEL_WILDCARDS; break; case 'w': if (rpctls_wildcard != X509_CHECK_FLAG_NO_WILDCARDS) errx(1, "options -w and -W are mutually " "exclusive"); rpctls_wildcard = 0; break; default: fprintf(stderr, "usage: %s " "[-D certdir] [-d] [-h] " "[-l CAfile] [-m] " "[-n domain_name] " "[-p CApath] [-r CRLfile] " "[-u] [-v] [-W] [-w]\n", argv[0]); exit(1); } } if (rpctls_do_mutual && rpctls_verify_cafile == NULL && rpctls_verify_capath == NULL) errx(1, "-m requires the -l and/or " "-p options"); if (rpctls_comparehost && (!rpctls_do_mutual || (rpctls_verify_cafile == NULL && rpctls_verify_capath == NULL))) errx(1, "-h requires the -m plus the " "-l and/or -p options"); if (!rpctls_comparehost && rpctls_wildcard != X509_CHECK_FLAG_NO_WILDCARDS) errx(1, "The -w or -W options require the -h option"); if (rpctls_cnuser && (!rpctls_do_mutual || (rpctls_verify_cafile == NULL && rpctls_verify_capath == NULL))) errx(1, "-u requires the -m plus the " "-l and/or -p options"); if (modfind("krpc") < 0) { /* Not present in kernel, try loading it 
*/ if (kldload("krpc") < 0 || modfind("krpc") < 0) errx(1, "Kernel RPC is not available"); } if (rpctls_debug_level == 0) { if (daemon(0, 0) != 0) err(1, "Can't daemonize"); signal(SIGINT, SIG_IGN); signal(SIGQUIT, SIG_IGN); signal(SIGHUP, SIG_IGN); } signal(SIGTERM, rpctlssd_terminate); signal(SIGPIPE, rpctlssd_terminate); signal(SIGHUP, rpctls_huphandler); pidfile_write(rpctls_pfh); memset(&sun, 0, sizeof sun); sun.sun_family = AF_LOCAL; unlink(_PATH_RPCTLSSDSOCK); strcpy(sun.sun_path, _PATH_RPCTLSSDSOCK); sun.sun_len = SUN_LEN(&sun); fd = socket(AF_LOCAL, SOCK_STREAM, 0); if (fd < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't create local rpctlssd socket"); exit(1); } err(1, "Can't create local rpctlssd socket"); } oldmask = umask(S_IXUSR|S_IRWXG|S_IRWXO); if (bind(fd, (struct sockaddr *)&sun, sun.sun_len) < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't bind local rpctlssd socket"); exit(1); } err(1, "Can't bind local rpctlssd socket"); } umask(oldmask); if (listen(fd, SOMAXCONN) < 0) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't listen on local rpctlssd socket"); exit(1); } err(1, "Can't listen on local rpctlssd socket"); } xprt = svc_vc_create(fd, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (!xprt) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't create transport for local rpctlssd socket"); exit(1); } err(1, "Can't create transport for local rpctlssd socket"); } if (!svc_reg(xprt, RPCTLSSD, RPCTLSSDVERS, rpctlssd_1, NULL)) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't register service for local rpctlssd socket"); exit(1); } err(1, "Can't register service for local rpctlssd socket"); } rpctls_ctx = rpctls_setup_ssl(rpctls_certdir); if (rpctls_ctx == NULL) { if (rpctls_debug_level == 0) { syslog(LOG_ERR, "Can't create SSL context"); exit(1); } err(1, "Can't create SSL context"); } rpctls_gothup = false; LIST_INIT(&rpctls_ssllist); gssd_syscall(_PATH_RPCTLSSDS); svc_run(); gssd_syscall("S"); 
SSL_CTX_free(rpctls_ctx); EVP_cleanup(); return (0); } static void rpctlssd_verbose_out(const char *fmt, ...) { va_list ap; if (rpctls_verbose) { va_start(ap, fmt); if (rpctls_debug_level == 0) vsyslog(LOG_INFO | LOG_DAEMON, fmt, ap); else vfprintf(stderr, fmt, ap); va_end(ap); } } bool_t rpctlssd_null_1_svc(void *argp, void *result, struct svc_req *rqstp) { rpctlssd_verbose_out("rpctlssd_null_svc: done\n"); return (TRUE); } bool_t rpctlssd_connect_1_svc(void *argp, struct rpctlssd_connect_res *result, struct svc_req *rqstp) { int ngrps, s; SSL *ssl; uint32_t flags; struct ssl_entry *newslp; uint32_t uid; uint32_t *gidp; rpctlssd_verbose_out("rpctlsd_connect_svc: started\n"); memset(result, 0, sizeof(*result)); /* Get the socket fd from the kernel. */ s = gssd_syscall("E"); rpctlssd_verbose_out("rpctlsd_connect_svc s=%d\n", s); if (s < 0) return (FALSE); /* Do the server side of a TLS handshake. */ gidp = calloc(NGROUPS, sizeof(*gidp)); ssl = rpctls_server(rpctls_ctx, s, &flags, &uid, &ngrps, gidp); if (ssl == NULL) { free(gidp); rpctlssd_verbose_out("rpctlssd_connect_svc: ssl " "accept failed\n"); /* * For RPC-over-TLS, this upcall is expected * to close off the socket. */ close(s); return (FALSE); } else { rpctlssd_verbose_out("rpctlssd_connect_svc: " "succeeded flags=0x%x\n", flags); result->flags = flags; result->sec = rpctls_ssl_sec; result->usec = rpctls_ssl_usec; result->ssl = ++rpctls_ssl_refno; /* Hard to believe this could ever wrap around.. 
*/ if (rpctls_ssl_refno == 0) result->ssl = ++rpctls_ssl_refno; if ((flags & RPCTLS_FLAGS_CERTUSER) != 0) { result->uid = uid; result->gid.gid_len = ngrps; result->gid.gid_val = gidp; } else { result->uid = 0; result->gid.gid_len = 0; result->gid.gid_val = gidp; } } /* Maintain list of all current SSL *'s */ newslp = malloc(sizeof(*newslp)); newslp->ssl = ssl; newslp->s = s; newslp->refno = rpctls_ssl_refno; LIST_INSERT_HEAD(&rpctls_ssllist, newslp, next); return (TRUE); } bool_t +rpctlssd_handlerecord_1_svc(struct rpctlssd_handlerecord_arg *argp, + void *result, struct svc_req *rqstp) +{ + struct ssl_entry *slp; + int ret; + char junk; + + slp = NULL; + if (argp->sec == rpctls_ssl_sec && argp->usec == + rpctls_ssl_usec) { + LIST_FOREACH(slp, &rpctls_ssllist, next) { + if (slp->refno == argp->ssl) + break; + } + } + + if (slp != NULL) { + rpctlssd_verbose_out("rpctlssd_handlerecord fd=%d\n", + slp->s); + /* + * An SSL_read() of 0 bytes should fail, but it should + * handle the non-application data record before doing so. + */ + ret = SSL_read(slp->ssl, &junk, 0); + if (ret <= 0) { + /* Check to see if this was a close alert. 
*/ + ret = SSL_get_shutdown(slp->ssl); +rpctlssd_verbose_out("get_shutdown=%d\n", ret); + if ((ret & (SSL_SENT_SHUTDOWN | + SSL_RECEIVED_SHUTDOWN)) == SSL_RECEIVED_SHUTDOWN) + SSL_shutdown(slp->ssl); + } else { + if (rpctls_debug_level == 0) + syslog(LOG_ERR, "SSL_read returned %d", ret); + else + fprintf(stderr, "SSL_read returned %d\n", ret); + } + } else + return (FALSE); + return (TRUE); +} + +bool_t rpctlssd_disconnect_1_svc(struct rpctlssd_disconnect_arg *argp, void *result, struct svc_req *rqstp) { struct ssl_entry *slp; slp = NULL; if (argp->sec == rpctls_ssl_sec && argp->usec == rpctls_ssl_usec) { LIST_FOREACH(slp, &rpctls_ssllist, next) { if (slp->refno == argp->ssl) break; } } if (slp != NULL) { rpctlssd_verbose_out("rpctlssd_disconnect fd=%d closed\n", slp->s); LIST_REMOVE(slp, next); SSL_free(slp->ssl); /* * For RPC-over-TLS, this upcall is expected * to close off the socket. */ + shutdown(slp->s, SHUT_WR); close(slp->s); free(slp); } else return (FALSE); return (TRUE); } int rpctlssd_1_freeresult(SVCXPRT *transp, xdrproc_t xdr_result, caddr_t result) { rpctlssd_connect_res *res; if (xdr_result == (xdrproc_t)xdr_rpctlssd_connect_res) { res = (rpctlssd_connect_res *)result; if (res->gid.gid_val != NULL) free(res->gid.gid_val); } return (TRUE); } static void rpctlssd_terminate(int sig __unused) { gssd_syscall("S"); pidfile_remove(rpctls_pfh); exit(0); } /* Allow the handshake to proceed. */ static int rpctls_verify_callback(int preverify_ok, X509_STORE_CTX *x509_ctx) { return (1); } static SSL_CTX * rpctls_setup_ssl(const char *certdir) { SSL_CTX *ctx; char path[PATH_MAX]; size_t len, rlen; int ret; SSL_library_init(); SSL_load_error_strings(); OpenSSL_add_all_algorithms(); ctx = SSL_CTX_new(TLS_server_method()); if (ctx == NULL) { rpctlssd_verbose_out("rpctls_setup_ssl: SSL_CTX_new failed\n"); return (NULL); } SSL_CTX_set_ecdh_auto(ctx, 1); /* * Set preferred ciphers, since KERN_TLS only supports a * few of them. 
*/ ret = SSL_CTX_set_cipher_list(ctx, _PREFERRED_CIPHERS); if (ret == 0) { rpctlssd_verbose_out("rpctls_setup_ssl: " "SSL_CTX_set_cipher_list failed to set any ciphers\n"); SSL_CTX_free(ctx); return (NULL); } /* Get the cert.pem and key.pem files from the directory certdir. */ len = strlcpy(path, certdir, sizeof(path)); rlen = sizeof(path) - len; if (strlcpy(&path[len], "cert.pem", rlen) != 8) { SSL_CTX_free(ctx); return (NULL); } ret = SSL_CTX_use_certificate_file(ctx, path, SSL_FILETYPE_PEM); if (ret != 1) { rpctlssd_verbose_out("rpctls_setup_ssl: can't use certificate " "file path=%s ret=%d\n", path, ret); SSL_CTX_free(ctx); return (NULL); } if (strlcpy(&path[len], "key.pem", rlen) != 7) { SSL_CTX_free(ctx); return (NULL); } ret = SSL_CTX_use_PrivateKey_file(ctx, path, SSL_FILETYPE_PEM); if (ret != 1) { rpctlssd_verbose_out("rpctls_setup_ssl: Can't use private " "key path=%s ret=%d\n", path, ret); SSL_CTX_free(ctx); return (NULL); } /* Set Mutual authentication, as required. */ if (rpctls_do_mutual) { if (rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL) { if (rpctls_crlfile != NULL) { ret = rpctls_loadcrlfile(ctx); if (ret == 0) { rpctlssd_verbose_out("rpctls_setup_ssl:" " Load CRLfile failed\n"); SSL_CTX_free(ctx); return (NULL); } } ret = SSL_CTX_load_verify_locations(ctx, rpctls_verify_cafile, rpctls_verify_capath); if (ret == 0) { rpctlssd_verbose_out("rpctls_setup_ssl: " "Can't load verify locations\n"); SSL_CTX_free(ctx); return (NULL); } if (rpctls_verify_cafile != NULL) SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file( rpctls_verify_cafile)); } SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, rpctls_verify_callback); } return (ctx); } static SSL * rpctls_server(SSL_CTX *ctx, int s, uint32_t *flags, uint32_t *uidp, int *ngrps, uint32_t *gidp) { SSL *ssl; X509 *cert; struct sockaddr *sad; struct sockaddr_storage ad; char hostnam[NI_MAXHOST]; int gethostret, ret; char *cp, *cp2; *flags = 0; sad = (struct sockaddr *)&ad; if (rpctls_gothup) { 
rpctls_gothup = false; ret = rpctls_loadcrlfile(ctx); if (ret == 0) rpctlssd_verbose_out("rpctls_server: Can't " "reload CRLfile\n"); } ssl = SSL_new(ctx); if (ssl == NULL) { rpctlssd_verbose_out("rpctls_server: SSL_new failed\n"); return (NULL); } if (SSL_set_fd(ssl, s) != 1) { rpctlssd_verbose_out("rpctls_server: SSL_set_fd failed\n"); SSL_free(ssl); return (NULL); } ret = SSL_accept(ssl); if (ret != 1) { rpctlssd_verbose_out("rpctls_server: SSL_accept " "failed ret=%d\n", ret); SSL_free(ssl); return (NULL); } *flags |= RPCTLS_FLAGS_HANDSHAKE; if (rpctls_do_mutual) { cert = SSL_get_peer_certificate(ssl); if (cert != NULL) { gethostret = rpctls_gethost(s, sad, hostnam, sizeof(hostnam)); if (gethostret == 0) hostnam[0] = '\0'; cp2 = X509_NAME_oneline( X509_get_subject_name(cert), NULL, 0); rpctlssd_verbose_out("%s\n", cp2); *flags |= RPCTLS_FLAGS_GOTCERT; ret = SSL_get_verify_result(ssl); if (ret != X509_V_OK) { cp = X509_NAME_oneline( X509_get_issuer_name(cert), NULL, 0); if (rpctls_debug_level == 0) syslog(LOG_INFO | LOG_DAEMON, "rpctls_server: client IP %s " "issuerName=%s subjectName=%s" " verify failed %s\n", hostnam, cp, cp2, X509_verify_cert_error_string(ret)); else fprintf(stderr, "rpctls_server: client IP %s " "issuerName=%s subjectName=%s" " verify failed %s\n", hostnam, cp, cp2, X509_verify_cert_error_string(ret)); } if (ret == X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT || ret == X509_V_ERR_SELF_SIGNED_CERT_IN_CHAIN) *flags |= RPCTLS_FLAGS_SELFSIGNED; else if (ret == X509_V_OK) { if (rpctls_comparehost) { ret = 0; if (gethostret != 0) ret = rpctls_checkhost(sad, cert); if (ret != 1) { *flags |= RPCTLS_FLAGS_DISABLED; rpctlssd_verbose_out( "rpctls_server: " "checkhost " "failed\n"); } } if (rpctls_cnuser) { ret = rpctls_cnname(cert, uidp, ngrps, gidp); if (ret != 0) *flags |= RPCTLS_FLAGS_CERTUSER; } *flags |= RPCTLS_FLAGS_VERIFIED; } X509_free(cert); } else rpctlssd_verbose_out("rpctls_server: " "No peer certificate\n"); } /* Check to see that ktls is 
working for the connection. */ ret = BIO_get_ktls_send(SSL_get_wbio(ssl)); rpctlssd_verbose_out("rpctls_server: BIO_get_ktls_send=%d\n", ret); if (ret != 0) { ret = BIO_get_ktls_recv(SSL_get_rbio(ssl)); rpctlssd_verbose_out("rpctls_server: BIO_get_ktls_recv=%d\n", ret); } if (ret == 0) { if (rpctls_debug_level == 0) - syslog(LOG_ERR, "ktls not working\n"); + syslog(LOG_ERR, "ktls not working"); else fprintf(stderr, "ktls not working\n"); /* * The handshake has completed, so all that can be * done is disable the connection. */ *flags |= RPCTLS_FLAGS_DISABLED; } return (ssl); } /* * Get the client's IP address. */ static int rpctls_gethost(int s, struct sockaddr *sad, char *hostip, size_t hostlen) { socklen_t slen; int ret; slen = sizeof(struct sockaddr_storage); if (getpeername(s, sad, &slen) < 0) return (0); ret = 0; if (getnameinfo((const struct sockaddr *)sad, sad->sa_len, hostip, hostlen, NULL, 0, NI_NUMERICHOST) == 0) { rpctlssd_verbose_out("rpctls_gethost: %s\n", hostip); ret = 1; } return (ret); } /* * Check a client IP address against any host address in the * certificate. Basically getnameinfo(3) and * X509_check_host(). */ static int rpctls_checkhost(struct sockaddr *sad, X509 *cert) { char hostnam[NI_MAXHOST]; int ret; if (getnameinfo((const struct sockaddr *)sad, sad->sa_len, hostnam, sizeof(hostnam), NULL, 0, NI_NAMEREQD) != 0) return (0); rpctlssd_verbose_out("rpctls_checkhost: DNS %s\n", hostnam); ret = X509_check_host(cert, hostnam, strlen(hostnam), rpctls_wildcard, NULL); return (ret); } /* * Acquire the dnsname for this server. 
*/ static char * rpctls_getdnsname(char *hostname) { char *cp, *dnsname; struct addrinfo *aip, hints; int error; dnsname = NULL; if (gethostname(hostname, MAXHOSTNAMELEN) == 0) { if ((cp = strchr(hostname, '.')) != NULL && *(cp + 1) != '\0') { *cp = '@'; dnsname = cp; } else { memset((void *)&hints, 0, sizeof (hints)); hints.ai_flags = AI_CANONNAME; error = getaddrinfo(hostname, NULL, &hints, &aip); if (error == 0) { if (aip->ai_canonname != NULL && (cp = strchr(aip->ai_canonname, '.')) != NULL && *(cp + 1) != '\0') { hostname[0] = '@'; strlcpy(&hostname[1], cp + 1, MAXHOSTNAMELEN + 1); dnsname = hostname; } freeaddrinfo(aip); } } } return (dnsname); } /* * Check a commonName to see if it maps to "user@domain" and * acquire a for it if it does. */ static int rpctls_cnname(X509 *cert, uint32_t *uidp, int *ngrps, uint32_t *gidp) { char *cp, usern[1024 + 1]; struct passwd *pwd; gid_t gids[NGROUPS]; int i; GENERAL_NAMES *genlist; GENERAL_NAME *genname; OTHERNAME *val; /* First, find the otherName in the subjectAltName. */ genlist = X509_get_ext_d2i(cert, NID_subject_alt_name, NULL, NULL); rpctlssd_verbose_out("genlist=%p\n", genlist); if (genlist == NULL) return (0); val = NULL; for (i = 0; i < sk_GENERAL_NAME_num(genlist); i++) { genname = sk_GENERAL_NAME_value(genlist, i); if (genname->type != GEN_OTHERNAME) continue; val = genname->d.otherName; break; } if (val == NULL) return (0); rpctlssd_verbose_out("fnd type=0x%x len=%d anstyp=0x%x data=%s\n", val->value->type, val->value->value.utf8string->length, val->value->value.utf8string->type, val->value->value.utf8string->data); /* Check to see that it is the correct OID. */ i = i2t_ASN1_OBJECT(usern, sizeof(usern), val->type_id); rpctlssd_verbose_out("obj=%d str=%s\n", i, usern); if (i != strlen(rpctls_cnuseroid) || memcmp(usern, rpctls_cnuseroid, i) != 0) { rpctlssd_verbose_out("rpctls_cnname: invalid cnuser " "oid len=%d val=%s\n", i, usern); return (0); } /* Sanity check the otherName. 
*/ if (val->value->type != V_ASN1_UTF8STRING || val->value->value.utf8string->length < 3 || val->value->value.utf8string->length > sizeof(usern) - 1) { rpctlssd_verbose_out("rpctls_cnname: invalid cnuser " "type=%d\n", val->value->type); return (0); } /* Look for a "user" in the otherName */ memcpy(usern, val->value->value.utf8string->data, val->value->value.utf8string->length); usern[val->value->value.utf8string->length] = '\0'; rpctlssd_verbose_out("rpctls_cnname: userstr %s\n", usern); /* Now, look for the @dnsname suffix in the commonName. */ cp = strcasestr(usern, rpctls_dnsname); if (cp == NULL) return (0); rpctlssd_verbose_out("dns=%s\n", cp); if (*(cp + strlen(rpctls_dnsname)) != '\0') return (0); *cp = '\0'; /* See if the "user" is in the passwd database. */ rpctlssd_verbose_out("user=%s\n", usern); pwd = getpwnam(usern); if (pwd == NULL) return (0); rpctlssd_verbose_out("pwname=%s\n", pwd->pw_name); *uidp = pwd->pw_uid; *ngrps = NGROUPS; if (getgrouplist(pwd->pw_name, pwd->pw_gid, gids, ngrps) < 0) return (0); for (i = 0; i < *ngrps; i++) gidp[i] = gids[i]; return (1); } /* * (re)load the CRLfile into the certificate verification store. */ static int rpctls_loadcrlfile(SSL_CTX *ctx) { X509_STORE *certstore; X509_LOOKUP *certlookup; int ret; if ((rpctls_verify_cafile != NULL || rpctls_verify_capath != NULL) && rpctls_crlfile != NULL) { certstore = SSL_CTX_get_cert_store(ctx); certlookup = X509_STORE_add_lookup( certstore, X509_LOOKUP_file()); ret = 0; if (certlookup != NULL) ret = X509_load_crl_file(certlookup, rpctls_crlfile, X509_FILETYPE_PEM); if (ret != 0) ret = X509_STORE_set_flags(certstore, X509_V_FLAG_CRL_CHECK | X509_V_FLAG_CRL_CHECK_ALL); if (ret == 0) { rpctlssd_verbose_out( "rpctls_loadcrlfile: Can't" " load CRLfile=%s\n", rpctls_crlfile); return (ret); } } return (1); } static void rpctls_huphandler(int sig __unused) { rpctls_gothup = true; }