diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 18ecf1409fd7..9a29df8b0340 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4384 +1,4283 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree(), sorele(), sonewconn() and sorflush(), which are usually called * from a pre-set VNET context. sopoll() currently does not need a VNET * context to be set. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); mtx_destroy(&so->so_snd_mtx); mtx_destroy(&so->so_rcv_mtx); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1 and a file descriptor * reference. The socket should be closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; /* * XXX: divert(4) historically abused PF_INET. Keep this compatibility * shim until all applications have been updated. */ if (__predict_false(dom == PF_INET && type == SOCK_RAW && proto == IPPROTO_DIVERT)) { dom = PF_DIVERT; printf("%s uses obsolete way to create divert(4) socket\n", td->td_proc->p_comm); } prp = pffindproto(dom, type, proto); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } MPASS(prp->pr_attach); if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) return (ECAPMODE); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); if ((prp->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = prp->pr_attach(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static int sooverprio = LOG_DEBUG; SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which supports * accept(2), the protocol has two options: * 1) Call legacy sonewconn() function, which would call protocol attach * method, same as used for socket(2). * 2) Call solisten_clone(), do attach that is specific to a cloned connection, * and then call solisten_enqueue(). * * Note: the ref count on the socket is 0 on return. */ struct socket * solisten_clone(struct socket *head) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = (sooverprio >= 0) && !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); /* * Preserve the historic listen queue overflow log * message, that starts with "sonewconn:". It has * been known to sysadmins for years and also test * sys/kern/sonewconn_overflow checks for it. */ if (head->so_cred == 0) { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: %i already in " "queue awaiting acceptance (%d " "occurrences)\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount); } else { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences), euid %d, rgid %d, jail %s\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount, head->so_cred->cr_uid, head->so_cred->cr_rgid, head->so_cred->cr_prison ? head->so_cred->cr_prison->pr_name : "not_jailed"); } sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; /* * POSIX is ambiguous on what options an accept(2)ed socket should * inherit from the listener. Words "create a new socket" may be * interpreted as not inheriting anything. Best programming practice * for application developers is to not rely on such inheritance. * FreeBSD had historically inherited all so_options excluding * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, * including those completely irrelevant to a new born socket. For * compatibility with older versions we will inherit a list of * meaningful options. */ so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); so->so_linger = head->so_linger; so->so_state = head->so_state; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } return (so); } /* Connstatus may be 0 or SS_ISCONNECTED. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; if ((so = solisten_clone(head)) == NULL) return (NULL); if (so->so_proto->pr_attach(so, 0, NULL) != 0) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", __func__, head->so_pcb); return (NULL); } (void)solisten_enqueue(so, connstatus); return (so); } /* * Enqueue socket cloned by solisten_clone() to the listen queue of the * listener it has been cloned from. * * Return 'true' if socket landed on complete queue, otherwise 'false'. */ bool solisten_enqueue(struct socket *so, int connstatus) { struct socket *head = so->so_listen; MPASS(refcount_load(&so->so_count) == 0); refcount_init(&so->so_count, 1); SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ return (true); } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); return (false); } } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. * * XXXGL: reduce copy-paste with solisten_clone(). */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bind(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bindat(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_listen(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. */ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue() and KTLS. */ if (!SOLISTENING(so)) { bool ktls; #ifdef KERN_TLS ktls = so->so_snd.sb_tls_info != NULL || so->so_rcv.sb_tls_info != NULL; #else ktls = false; #endif if (ktls || (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { solisten_proto_abort(so); return (EINVAL); } } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. The socket reference held by the * listen queue is handed to the caller. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } /* * Free socket upon release of the very last reference. */ static void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); KASSERT(refcount_load(&so->so_count) == 0, ("%s: so %p has references", __func__, so)); KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, ("%s: so %p is on listen queue", __func__, so)); SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_detach != NULL) pr->pr_detach(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. */ if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_close != NULL) so->so_proto->pr_close(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) soabort(sp); } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on. Likely this thread holds the last * reference on the socket and soabort() will proceed with sofree(). But * it might be not the last, as the sockets on the listen queues are seen * from the protocol side. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. * * Usually socket should have a single reference left, but this is not a * requirement. In the past, when we have had named references for file * descriptor and protocol, we asserted that none of them are being held. */ void soabort(struct socket *so) { VNET_SO_ASSERT(so); if (so->so_proto->pr_abort != NULL) so->so_proto->pr_abort(so); SOCK_LOCK(so); sorele_locked(so); } int soaccept(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_accept(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int sopeeraddr(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_peeraddr(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int sosockaddr(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sockaddr(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. * * Note, this check is racy and may need to be re-evaluated at the * protocol layer. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = so->so_proto->pr_connect(so, nam, td); } else { error = so->so_proto->pr_connectat(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = so1->so_proto->pr_connect2(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = so->so_proto->pr_disconnect(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if (!(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pr_send_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_send_flag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_send_flag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; goto release; } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if (!(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pr_send_flag |= tls_send_flag; #endif error = so->so_proto->pr_send(so, pr_send_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send to a socket from a kernel thread. * * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs * to be set at all. This function should just boil down to a static inline * calling the protocol method. */ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * send(2), write(2) or aio_write(2) on a socket. */ int sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *control, int flags, struct proc *userproc) { struct thread *td; ssize_t len; int error; td = uio->uio_td; len = uio->uio_resid; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, td); CURVNET_RESTORE(); if (error != 0) { /* * Clear transient errors for stream protocols if they made * some progress. Make exclusion for aio(4) that would * schedule a new write in case of EWOULDBLOCK and clear * error itself. See soaio_process_job(). */ if (uio->uio_resid != len && (so->so_proto->pr_flags & PR_ATOMIC) == 0 && userproc == NULL && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && (flags & MSG_NOSIGNAL) == 0) { if (userproc != NULL) { /* aio(4) job */ PROC_LOCK(userproc); kern_psignal(userproc, SIGPIPE); PROC_UNLOCK(userproc); } else { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } } return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0 && !report_real_len) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for an alert record. * If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); if (__predict_false(tgr.tls_type == TLS_RLTYPE_ALERT)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); if (__predict_false(so->so_rcv.sb_mb == NULL && (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE))) break; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (report_real_len) uio->uio_resid -= m_length(m, NULL) - moff; flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(so, SO_RCV); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, enum shutdown_how how) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_shutdown(so, how); CURVNET_RESTORE(); return (error); } /* * Used by several pr_shutdown implementations that use generic socket buffers. */ void sorflush(struct socket *so) { int error; VNET_SO_ASSERT(so); /* * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. * * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive * methods that read the top of the socket buffer without acquisition * of the socket buffer mutex, assuming that top of the buffer * exclusively belongs to the read(2) syscall. This is handy when * performing MSG_PEEK. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } sbrelease(so, SO_RCV); SOCK_IO_RECV_UNLOCK(so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val, *valp; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = so->so_proto->pr_setsbopt(so, sopt); if (error) goto bad; break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); SOCK_LOCK(so); valp = sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? &so->sol_sbsnd_timeo : &so->so_snd.sb_timeo) : (SOLISTENING(so) ? &so->sol_sbrcv_timeo : &so->so_rcv.sb_timeo); *valp = val; SOCK_UNLOCK(so); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: SOCK_LOCK(so); tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? so->sol_sbsnd_timeo : so->so_snd.sb_timeo) : (SOLISTENING(so) ? so->sol_sbrcv_timeo : so->so_rcv.sb_timeo)); SOCK_UNLOCK(so); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; /* Don't copy out extmac, it is unchanged. */ #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; /* Don't copy out extmac, it is unchanged. */ #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; sb_which which; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; which = SO_RCV; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCK_RECVBUF_LOCK_ASSERT(so); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCK_RECVBUF_LOCK(so); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCK_RECVBUF_UNLOCK(so); goto again; } SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, sb_which which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_RECVBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_RECVBUF_UNLOCK(so); goto retry; } } } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_RECVBUF_UNLOCK(so); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_RECVBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_RECVBUF_UNLOCK_ASSERT(so); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_SENDBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_SENDBUF_UNLOCK(so); goto retry; } } } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_SENDBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_SENDBUF_UNLOCK_ASSERT(so); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } -struct sockbuf * -so_sockbuf_rcv(struct socket *so) -{ - - return (&so->so_rcv); -} - -struct sockbuf * -so_sockbuf_snd(struct socket *so) -{ - - return (&so->so_snd); -} - -int -so_state_get(const struct socket *so) -{ - - return (so->so_state); -} - -void -so_state_set(struct socket *so, int val) -{ - - so->so_state = val; -} - int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } - -int -so_linger_get(const struct socket *so) -{ - - return (so->so_linger); -} - -void -so_linger_set(struct socket *so, int val) -{ - - KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), - ("%s: val %d out of range", __func__, val)); - - so->so_linger = val; -} - -struct protosw * -so_protosw_get(const struct socket *so) -{ - - return (so->so_proto); -} - -void -so_protosw_set(struct socket *so, struct protosw *val) -{ - - so->so_proto = val; -} - -void -so_sorwakeup(struct socket *so) -{ - - sorwakeup(so); -} - -void -so_sowwakeup(struct socket *so) -{ - - sowwakeup(so); -} - -void -so_sorwakeup_locked(struct socket *so) -{ - - sorwakeup_locked(so); -} - -void -so_sowwakeup_locked(struct socket *so) -{ - - sowwakeup_locked(so); -} - -void -so_lock(struct socket *so) -{ - - SOCK_LOCK(so); -} - -void -so_unlock(struct socket *so) -{ - - SOCK_UNLOCK(so); -} diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 16a3707e7c89..d966f8e386fe 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,3514 +1,3507 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2021-2022 Gleb Smirnoff * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef INET #include #include #endif #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #endif #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */ /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Enable random ephemeral port allocation by default. */ VNET_DEFINE(int, ipport_randomized) = 1; #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); #ifdef RATELIMIT counter_u64_t rate_limit_new; counter_u64_t rate_limit_chg; counter_u64_t rate_limit_active; counter_u64_t rate_limit_alloc_fail; counter_u64_t rate_limit_set_ok; static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "IP Rate Limiting"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, &rate_limit_active, "Active rate limited connections"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, &rate_limit_alloc_fail, "Rate limited connection failures"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, &rate_limit_set_ok, "Rate limited setting succeeded"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, &rate_limit_new, "Total Rate limit new attempts"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, &rate_limit_chg, "Total Rate limited change attempts"); #endif /* RATELIMIT */ #endif /* INET */ VNET_DEFINE(uint32_t, in_pcbhashseed); static void in_pcbhashseed_init(void) { V_in_pcbhashseed = arc4random(); } VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, 0); static void in_pcbremhash(struct inpcb *); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (grp == NULL) return (NULL); grp->il_cred = crhold(cred); grp->il_vflag = vflag; grp->il_lport = port; grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); crfree(grp->il_cred); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size, old_grp->il_numa_domain); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. */ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && grp->il_numa_domain == numa_domain && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) { break; } } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. */ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; inp->inp_flags |= INP_INLBGROUP; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); MPASS(inp->inp_flags & INP_INLBGROUP); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } inp->inp_flags &= ~INP_INLBGROUP; return; } } KASSERT(0, ("%s: did not find %p", __func__, inp)); } int in_pcblbgroup_numa(struct inpcb *inp, int arg) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int err, i; uint8_t numa_domain; switch (arg) { case TCP_REUSPORT_LB_NUMA_NODOM: numa_domain = M_NODOM; break; case TCP_REUSPORT_LB_NUMA_CURDOM: numa_domain = PCPU_GET(domain); break; default: if (arg < 0 || arg >= vm_ndomains) return (EINVAL); numa_domain = arg; } err = 0; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_numa_domain == numa_domain) { goto abort_with_hash_wlock; } /* Remove it from the old group. */ in_pcbremlbgrouphash(inp); /* Add it to the new group based on numa domain. */ in_pcbinslbgrouphash(inp, numa_domain); goto abort_with_hash_wlock; } } err = ENOENT; abort_with_hash_wlock: INP_HASH_WUNLOCK(pcbinfo); return (err); } /* Make sure it is safe to use hashinit(9) on CK_LIST. */ CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); /* * Initialize an inpcbinfo - a per-VNET instance of connections db. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, u_int hash_nelements, u_int porthash_nelements) { mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = pcbstor->ips_zone; pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); mtx_destroy(&pcbinfo->ipi_hash_lock); mtx_destroy(&pcbinfo->ipi_lock); } /* * Initialize a pcbstorage - per protocol zones to allocate inpcbs. */ static void inpcb_fini(void *, int); void in_pcbstorage_init(void *arg) { struct inpcbstorage *pcbstor = arg; pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_smr(pcbstor->ips_portzone, uma_zone_get_smr(pcbstor->ips_zone)); } /* * Destroy a pcbstorage - used by unloadable protocols. */ void in_pcbstorage_destroy(void *arg) { struct inpcbstorage *pcbstor = arg; uma_zdestroy(pcbstor->ips_zone); uma_zdestroy(pcbstor->ips_portzone); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) int error; #endif inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); #ifdef NUMA inp->inp_numa_domain = M_NODOM; #endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; #ifdef INET else inp->inp_vflag |= INP_IPV4; #endif if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; inp->in6p_hops = -1; /* use kernel default */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET inp->inp_vflag |= INP_IPV4; #endif inp->inp_smr = SMR_SEQ_INVALID; /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ INP_WLOCK(inp); INP_INFO_WLOCK(pcbinfo); pcbinfo->ipi_count++; inp->inp_gencnt = ++pcbinfo->ipi_gencnt; CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); INP_INFO_WUNLOCK(pcbinfo); so->so_pcb = inp; return (0); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: uma_zfree_smr(pcbinfo->ipi_zone, inp); return (error); #endif } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) { int anonport, error; KASSERT(sin == NULL || sin->sin_family == AF_INET, ("%s: invalid address family for %p", __func__, sin)); KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), ("%s: invalid address length for %p", __func__, sin)); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = sin == NULL || sin->sin_port == 0; error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif #if defined(INET) || defined(INET6) /* * Assign a local port like in_pcb_lport(), but also used with connect() * and a foreign address and port. If fsa is non-NULL, choose a local port * that is unused with those, otherwise one that is completely unused. * lsa can be NULL for IPv6. */ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr, faddr; #endif #ifdef INET6 struct in6_addr *laddr6, *faddr6; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { if (lsa != NULL) laddr = ((struct sockaddr_in *)lsa)->sin_addr; if (fsa != NULL) faddr = ((struct sockaddr_in *)fsa)->sin_addr; } #endif #ifdef INET6 laddr6 = NULL; if ((inp->inp_vflag & INP_IPV6) != 0) { if (lsa != NULL) laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; if (fsa != NULL) faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; } #endif tmpinp = NULL; lport = *lportp; if (V_ipport_randomized) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); if (fsa != NULL) { #ifdef INET if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, M_NODOM); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, M_NODOM); } #endif } else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #ifdef INET if (tmpinp == NULL && (inp->inp_vflag & INP_IPV4)) tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } } while (tmpinp != NULL); *lportp = lport; return (0); } /* * Select a local port (number) to use. */ int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct sockaddr_in laddr; if (laddrp) { bzero(&laddr, sizeof(laddr)); laddr.sin_family = AF_INET; laddr.sin_addr = *laddrp; } return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : NULL, lportp, NULL, 0, cred, lookupflags)); } #endif /* INET || INET6 */ #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); laddr.s_addr = *laddrp; if (sin != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (sin == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { KASSERT(sin->sin_family == AF_INET, ("%s: invalid family for address %p", __func__, sin)); KASSERT(sin->sin_len == sizeof(*sin), ("%s: invalid length for address %p", __func__, sin)); error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t != NULL && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) || (t->inp_socket->so_options & SO_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 && (reuseport_lb & t->inp_socket->so_options) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred, bool rehash __unused) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); KASSERT(in_nullhost(inp->inp_faddr), ("%s: inp is already connected", __func__)); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport, cred); if (error) return (error); inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_lport = inp->inp_fport = 0; return (EAGAIN); } } else { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if ((inp->inp_flags & INP_INHASHLIST) != 0) in_pcbrehash(inp); else in_pcbinshash(inp); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin, dst; struct nhop_object *nh; int error; NET_EPOCH_ASSERT(); KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (!prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; nh = NULL; bzero(&dst, sizeof(dst)); sin = &dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 0, NHR_NONE, 0); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (nh == NULL || nh->nh_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (!prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (!prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = nh->nh_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { struct in_ifaddr *ia; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); if (!prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (error == 0 && laddr->s_addr == INADDR_ANY) return (EHOSTUNREACH); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct ucred *cred) { struct in_ifaddr *ia; struct in_addr laddr, faddr; u_short lport, fport; int error; KASSERT(sin->sin_family == AF_INET, ("%s: invalid address family for %p", __func__, sin)); KASSERT(sin->sin_len == sizeof(*sin), ("%s: invalid address length for %p", __func__, sin)); /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ NET_EPOCH_ASSERT(); INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; #ifdef ROUTE_MPATH if (CALC_FLOWID_OUTBOUND) { uint32_t hash_val, hash_type; hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, inp->inp_socket->so_proto->pr_protocol, &hash_type); inp->inp_flowid = hash_val; inp->inp_flowtype = hash_type; } #endif if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; if ((error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == ifp && prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } } } if (error) return (error); } if (lport != 0) { if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, M_NODOM) != NULL) return (EADDRINUSE); } else { struct sockaddr_in lsin, fsin; bzero(&lsin, sizeof(lsin)); bzero(&fsin, sizeof(fsin)); lsin.sin_family = AF_INET; lsin.sin_addr = laddr; fsin.sin_family = AF_INET; fsin.sin_addr = faddr; error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, &lport, (struct sockaddr *)& fsin, fport, cred, INPLOOKUP_WILDCARD); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp %p was already disconnected", __func__, inp)); in_pcbremhash_locked(inp); /* See the comment in in_pcbinshash(). */ inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; } #endif /* INET */ /* * inpcb hash lookups are protected by SMR section. * * Once desired pcb has been found, switching from SMR section to a pcb * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK * here because SMR is a critical section. * In 99%+ cases inp_smr_lock() would obtain the lock immediately. */ void inp_lock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); } void inp_unlock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); } int inp_trylock(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); } static inline bool _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) { MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); if (__predict_true(inp_trylock(inp, lock))) { if (__predict_false(inp->inp_flags & ignflags)) { smr_exit(inp->inp_pcbinfo->ipi_smr); inp_unlock(inp, lock); return (false); } smr_exit(inp->inp_pcbinfo->ipi_smr); return (true); } if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { smr_exit(inp->inp_pcbinfo->ipi_smr); inp_lock(inp, lock); if (__predict_false(in_pcbrele(inp, lock))) return (false); /* * inp acquired through refcount & lock for sure didn't went * through uma_zfree(). However, it may have already went * through in_pcbfree() and has another reference, that * prevented its release by our in_pcbrele(). */ if (__predict_false(inp->inp_flags & ignflags)) { inp_unlock(inp, lock); return (false); } return (true); } else { smr_exit(inp->inp_pcbinfo->ipi_smr); return (false); } } bool inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) { /* * in_pcblookup() family of functions ignore not only freed entries, * that may be found due to lockless access to the hash, but dropped * entries, too. */ return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED)); } /* * inp_next() - inpcb hash/list traversal iterator * * Requires initialized struct inpcb_iterator for context. * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). * * - Iterator can have either write-lock or read-lock semantics, that can not * be changed later. * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through * a single hash slot. Note: only rip_input() does the latter. * - Iterator may have optional bool matching function. The matching function * will be executed for each inpcb in the SMR context, so it can not acquire * locks and can safely access only immutable fields of inpcb. * * A fresh initialized iterator has NULL inpcb in its context and that * means that inp_next() call would return the very first inpcb on the list * locked with desired semantic. In all following calls the context pointer * shall hold the current inpcb pointer. The KPI user is not supposed to * unlock the current inpcb! Upon end of traversal inp_next() will return NULL * and write NULL to its context. After end of traversal an iterator can be * reused. * * List traversals have the following features/constraints: * - New entries won't be seen, as they are always added to the head of a list. * - Removed entries won't stop traversal as long as they are not added to * a different list. This is violated by in_pcbrehash(). */ #define II_LIST_FIRST(ipi, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)])) #define II_LIST_NEXT(inp, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_NEXT((inp), inp_list) : \ CK_LIST_NEXT((inp), inp_hash_exact)) #define II_LOCK_ASSERT(inp, lock) \ rw_assert(&(inp)->inp_lock, \ (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) struct inpcb * inp_next(struct inpcb_iterator *ii) { const struct inpcbinfo *ipi = ii->ipi; inp_match_t *match = ii->match; void *ctx = ii->ctx; inp_lookup_t lock = ii->lock; int hash = ii->hash; struct inpcb *inp; if (ii->inp == NULL) { /* First call. */ smr_enter(ipi->ipi_smr); /* This is unrolled CK_LIST_FOREACH(). */ for (inp = II_LIST_FIRST(ipi, hash); inp != NULL; inp = II_LIST_NEXT(inp, hash)) { if (match != NULL && (match)(inp, ctx) == false) continue; if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED))) break; else { smr_enter(ipi->ipi_smr); MPASS(inp != II_LIST_FIRST(ipi, hash)); inp = II_LIST_FIRST(ipi, hash); if (inp == NULL) break; } } if (inp == NULL) smr_exit(ipi->ipi_smr); else ii->inp = inp; return (inp); } /* Not a first call. */ smr_enter(ipi->ipi_smr); restart: inp = ii->inp; II_LOCK_ASSERT(inp, lock); next: inp = II_LIST_NEXT(inp, hash); if (inp == NULL) { smr_exit(ipi->ipi_smr); goto found; } if (match != NULL && (match)(inp, ctx) == false) goto next; if (__predict_true(inp_trylock(inp, lock))) { if (__predict_false(inp->inp_flags & INP_FREED)) { /* * Entries are never inserted in middle of a list, thus * as long as we are in SMR, we can continue traversal. * Jump to 'restart' should yield in the same result, * but could produce unnecessary looping. Could this * looping be unbound? */ inp_unlock(inp, lock); goto next; } else { smr_exit(ipi->ipi_smr); goto found; } } /* * Can't obtain lock immediately, thus going hard. Once we exit the * SMR section we can no longer jump to 'next', and our only stable * anchoring point is ii->inp, which we keep locked for this case, so * we jump to 'restart'. */ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { smr_exit(ipi->ipi_smr); inp_lock(inp, lock); if (__predict_false(in_pcbrele(inp, lock))) { smr_enter(ipi->ipi_smr); goto restart; } /* * See comment in inp_smr_lock(). */ if (__predict_false(inp->inp_flags & INP_FREED)) { inp_unlock(inp, lock); smr_enter(ipi->ipi_smr); goto restart; } } else goto next; found: inp_unlock(ii->inp, lock); ii->inp = inp; return (ii->inp); } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released or * SMR section exited. * * To free a reference later in_pcbrele_(r|w)locked() must be performed. */ void in_pcbref(struct inpcb *inp) { u_int old __diagused; old = refcount_acquire(&inp->inp_refcount); KASSERT(old > 0, ("%s: refcount 0", __func__)); } /* * Drop a refcount on an inpcb elevated using in_pcbref(), potentially * freeing the pcb, if the reference was very last. */ bool in_pcbrele_rlocked(struct inpcb *inp) { INP_RLOCK_ASSERT(inp); if (!refcount_release(&inp->inp_refcount)) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; #endif INP_RUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } bool in_pcbrele_wlocked(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); if (!refcount_release(&inp->inp_refcount)) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; #endif INP_WUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } bool in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. * Almost all work, including removal from global lists, is done in this * context, where the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INET struct ip_moptions *imo; #endif #ifdef INET6 struct ip6_moptions *im6o; #endif INP_WLOCK_ASSERT(inp); KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); KASSERT((inp->inp_flags & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); /* * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb * from the hash without acquiring inpcb lock, they rely on the hash * lock, thus in_pcbremhash() should be the first action. */ if (inp->inp_flags & INP_INHASHLIST) in_pcbremhash(inp); INP_INFO_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; pcbinfo->ipi_count--; CK_LIST_REMOVE(inp, inp_list); INP_INFO_WUNLOCK(pcbinfo); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_flags |= INP_FREED; inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; RO_INVALIDATE_CACHE(&inp->inp_route); #ifdef MAC mac_inpcb_destroy(inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET if (inp->inp_options) (void)m_free(inp->inp_options); DEBUG_POISON_POINTER(inp->inp_options); imo = inp->inp_moptions; DEBUG_POISON_POINTER(inp->inp_moptions); #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); DEBUG_POISON_POINTER(inp->in6p_outputopts); im6o = inp->in6p_moptions; DEBUG_POISON_POINTER(inp->in6p_moptions); } else im6o = NULL; #endif if (__predict_false(in_pcbrele_wlocked(inp) == false)) { INP_WUNLOCK(inp); } #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif } /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); #ifdef INVARIANTS if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) MPASS(inp->inp_refcount > 1); #endif inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) in_pcbremhash(inp); } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ int in_getsockaddr(struct socket *so, struct sockaddr *sa) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); *(struct sockaddr_in *)sa = (struct sockaddr_in ){ .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_port = inp->inp_lport, .sin_addr = inp->inp_laddr, }; return (0); } int in_getpeeraddr(struct socket *so, struct sockaddr *sa) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); *(struct sockaddr_in *)sa = (struct sockaddr_in ){ .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_port = inp->inp_fport, .sin_addr = inp->inp_faddr, }; return (0); } static bool inp_v4_multi_match(const struct inpcb *inp, void *v __unused) { if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) return (true); else return (false); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, inp_v4_multi_match, NULL); struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; IN_MULTI_LOCK_ASSERT(); while ((inp = inp_next(&inpi)) != NULL) { INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ restart: IP_MFILTER_FOREACH(imf, &imo->imo_head) { if ((inm = imf->imf_inm) == NULL) continue; if (inm->inm_ifp != ifp) continue; ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup_locked(inm, NULL); ip_mfilter_free(imf); goto restart; } } } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (!prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static bool in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) { return (domain == M_NODOM || domain == grp->il_numa_domain); } static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, uint16_t lport, int domain) { const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Search for an LB group match based on the following criteria: * - prefer jailed groups to non-jailed groups * - prefer exact source address matches to wildcard matches * - prefer groups bound to the specified NUMA domain */ jail_exact = jail_wild = local_exact = local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { bool injail; #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport != lport) continue; injail = prison_flag(grp->il_cred, PR_IP4) != 0; if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, laddr) != 0) continue; if (grp->il_laddr.s_addr == laddr->s_addr) { if (injail) { jail_exact = grp; if (in_pcblookup_lb_numa_match(grp, domain)) /* This is a perfect match. */ goto out; } else if (local_exact == NULL || in_pcblookup_lb_numa_match(grp, domain)) { local_exact = grp; } } else if (grp->il_laddr.s_addr == INADDR_ANY) { if (injail) { if (jail_wild == NULL || in_pcblookup_lb_numa_match(grp, domain)) jail_wild = grp; } else if (local_wild == NULL || in_pcblookup_lb_numa_match(grp, domain)) { local_wild = grp; } } } if (jail_exact != NULL) grp = jail_exact; else if (jail_wild != NULL) grp = jail_wild; else if (local_exact != NULL) grp = local_exact; else grp = local_wild; if (grp == NULL) return (NULL); out: return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % grp->il_inpcnt]); } static bool in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) return (false); #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) return (true); return (false); } static struct inpcb * in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { struct inpcbhead *head; struct inpcb *inp; INP_HASH_LOCK_ASSERT(pcbinfo); head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_exact) { if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport)) return (inp); } return (NULL); } typedef enum { INPLOOKUP_MATCH_NONE = 0, INPLOOKUP_MATCH_WILD = 1, INPLOOKUP_MATCH_LADDR = 2, } inp_lookup_match_t; static inp_lookup_match_t in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, u_short lport) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) return (INPLOOKUP_MATCH_NONE); #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) return (INPLOOKUP_MATCH_NONE); if (inp->inp_laddr.s_addr == INADDR_ANY) return (INPLOOKUP_MATCH_WILD); if (inp->inp_laddr.s_addr == laddr.s_addr) return (INPLOOKUP_MATCH_LADDR); return (INPLOOKUP_MATCH_NONE); } #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) static struct inpcb * in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport, const inp_lookup_t lockflags) { struct inpcbhead *head; struct inpcb *inp; KASSERT(SMR_ENTERED(pcbinfo->ipi_smr), ("%s: not in SMR read section", __func__)); head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_wild) { inp_lookup_match_t match; match = in_pcblookup_wild_match(inp, laddr, lport); if (match == INPLOOKUP_MATCH_NONE) continue; if (__predict_true(inp_smr_lock(inp, lockflags))) { match = in_pcblookup_wild_match(inp, laddr, lport); if (match != INPLOOKUP_MATCH_NONE && prison_check_ip4_locked(inp->inp_cred->cr_prison, &laddr) == 0) return (inp); inp_unlock(inp, lockflags); } /* * The matching socket disappeared out from under us. Fall back * to a serialized lookup. */ return (INP_LOOKUP_AGAIN); } return (NULL); } static struct inpcb * in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { struct inpcbhead *head; struct inpcb *inp, *local_wild, *local_exact, *jail_wild; #ifdef INET6 struct inpcb *local_wild_mapped; #endif INP_HASH_LOCK_ASSERT(pcbinfo); /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; local_wild = local_exact = jail_wild = NULL; #ifdef INET6 local_wild_mapped = NULL; #endif CK_LIST_FOREACH(inp, head, inp_hash_wild) { inp_lookup_match_t match; bool injail; match = in_pcblookup_wild_match(inp, laddr, lport); if (match == INPLOOKUP_MATCH_NONE) continue; injail = prison_flag(inp->inp_cred, PR_IP4) != 0; if (injail) { if (prison_check_ip4_locked(inp->inp_cred->cr_prison, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (match == INPLOOKUP_MATCH_LADDR) { if (injail) return (inp); local_exact = inp; } else { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has either locked the hash list, which usually happens * for bind(2) operations, or is in SMR section, which happens when sorting * out incoming packets. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT(faddr.s_addr != INADDR_ANY, ("%s: invalid foreign address", __func__)); KASSERT(laddr.s_addr != INADDR_ANY, ("%s: invalid local address", __func__)); INP_HASH_WLOCK_ASSERT(pcbinfo); inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); if (inp != NULL) return (inp); if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, &laddr, lport, numa_domain); if (inp == NULL) { inp = in_pcblookup_hash_wild_locked(pcbinfo, faddr, fport, laddr, lport); } } return (inp); } static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); INP_HASH_WLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain); if (inp != NULL && !inp_trylock(inp, lockflags)) { in_pcbref(inp); INP_HASH_WUNLOCK(pcbinfo); inp_lock(inp, lockflags); if (in_pcbrele(inp, lockflags)) /* XXX-MJ or retry until we get a negative match? */ inp = NULL; } else { INP_HASH_WUNLOCK(pcbinfo); } return (inp); } static struct inpcb * in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; const u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); smr_enter(pcbinfo->ipi_smr); inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); if (inp != NULL) { if (__predict_true(inp_smr_lock(inp, lockflags))) { /* * Revalidate the 4-tuple, the socket could have been * disconnected. */ if (__predict_true(in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))) return (inp); inp_unlock(inp, lockflags); } /* * We failed to lock the inpcb, or its connection state changed * out from under us. Fall back to a precise search. */ return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, numa_domain)); } if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, &laddr, lport, numa_domain); if (inp != NULL) { if (__predict_true(inp_smr_lock(inp, lockflags))) { if (__predict_true(in_pcblookup_wild_match(inp, laddr, lport) != INPLOOKUP_MATCH_NONE)) return (inp); inp_unlock(inp, lockflags); } inp = INP_LOOKUP_AGAIN; } else { inp = in_pcblookup_hash_wild_smr(pcbinfo, faddr, fport, laddr, lport, lockflags); } if (inp == INP_LOOKUP_AGAIN) { return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, numa_domain)); } } if (inp == NULL) smr_exit(pcbinfo->ipi_smr); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp __unused) { return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, lookupflags, M_NODOM)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp __unused, struct mbuf *m) { return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, lookupflags, m->m_pkthdr.numa_domain)); } #endif /* INET */ static bool in_pcbjailed(const struct inpcb *inp, unsigned int flag) { return (prison_flag(inp->inp_cred, flag) != 0); } /* * Insert the PCB into a hash chain using ordering rules which ensure that * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. * * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs * always appear last no matter whether they are jailed. */ static void _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) { struct inpcb *last; bool bound, injail; INP_LOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); last = NULL; bound = inp->inp_laddr.s_addr != INADDR_ANY; if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } injail = in_pcbjailed(inp, PR_IP4); if (!injail) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (!in_pcbjailed(last, PR_IP4)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } else if (!CK_LIST_EMPTY(pcbhash) && !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } if (!bound) { CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { if (last->inp_laddr.s_addr == INADDR_ANY) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } if (last == NULL) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); else CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); } #ifdef INET6 /* * See the comment above _in_pcbinshash_wild(). */ static void _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) { struct inpcb *last; bool bound, injail; INP_LOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); last = NULL; bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); injail = in_pcbjailed(inp, PR_IP6); if (!injail) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (!in_pcbjailed(last, PR_IP6)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } else if (!CK_LIST_EMPTY(pcbhash) && !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } if (!bound) { CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } if (last == NULL) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); else CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); } #endif /* * Insert PCB onto various hash lists. */ int in_pcbinshash(struct inpcb *inp) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; uint32_t hash; bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); } else #endif { hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !in_nullhost(inp->inp_faddr); } if (connected) pcbhash = &pcbinfo->ipi_hash_exact[hash]; else pcbhash = &pcbinfo->ipi_hash_wild[hash]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. */ if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { int error = in_pcbinslbgrouphash(inp, M_NODOM); if (error != 0) return (error); } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); if (phd == NULL) { if ((inp->inp_flags & INP_INLBGROUP) != 0) in_pcbremlbgrouphash(inp); return (ENOMEM); } phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); /* * The PCB may have been disconnected in the past. Before we can safely * make it visible in the hash table, we must wait for all readers which * may be traversing this PCB to finish. */ if (inp->inp_smr != SMR_SEQ_INVALID) { smr_wait(pcbinfo->ipi_smr, inp->inp_smr); inp->inp_smr = SMR_SEQ_INVALID; } if (connected) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) _in6_pcbinshash_wild(pcbhash, inp); else #endif _in_pcbinshash_wild(pcbhash, inp); } inp->inp_flags |= INP_INHASHLIST; return (0); } void in_pcbremhash_locked(struct inpcb *inp) { struct inpcbport *phd = inp->inp_phd; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); MPASS(inp->inp_flags & INP_INHASHLIST); if ((inp->inp_flags & INP_INLBGROUP) != 0) in_pcbremlbgrouphash(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); } else #endif { if (in_nullhost(inp->inp_faddr)) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); } CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); } inp->inp_flags &= ~INP_INHASHLIST; } static void in_pcbremhash(struct inpcb *inp) { INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremhash_locked(inp); INP_HASH_WUNLOCK(inp->inp_pcbinfo); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; uint32_t hash; bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("%s: !INP_INHASHLIST", __func__)); KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp was disconnected", __func__)); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); } else #endif { hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !in_nullhost(inp->inp_faddr); } /* * When rehashing, the caller must ensure that either the new or the old * foreign address was unspecified. */ if (connected) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); if (connected) { head = &pcbinfo->ipi_hash_exact[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); } else { head = &pcbinfo->ipi_hash_wild[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); } } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(struct inpcbinfo *pcbinfo, void (*func)(struct inpcb *, void *), void *arg) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; while ((inp = inp_next(&inpi)) != NULL) func(inp, arg); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } -struct inpcb * -so_sotoinpcb(struct socket *so) -{ - - return (sotoinpcb(so)); -} - /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)) { struct sockopt sopt; struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct sockopt_parameters *params; struct socket *so; int error; char buf[1024]; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen > sizeof(buf)) return (ENOMEM); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) return (error); if (req->newlen < sizeof(struct sockopt_parameters)) return (EINVAL); params = (struct sockopt_parameters *)buf; sopt.sopt_level = params->sop_level; sopt.sopt_name = params->sop_optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = params->sop_optval; sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); sopt.sopt_td = NULL; #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) { if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) params->sop_inc.inc6_laddr.s6_addr16[1] = htons(params->sop_inc.inc6_zoneid & 0xffff); if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) params->sop_inc.inc6_faddr.s6_addr16[1] = htons(params->sop_inc.inc6_zoneid & 0xffff); } #endif if (params->sop_inc.inc_lport != htons(0) && params->sop_inc.inc_fport != htons(0)) { #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) inpi.hash = INP6_PCBHASH( ¶ms->sop_inc.inc6_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); else #endif inpi.hash = INP_PCBHASH( ¶ms->sop_inc.inc_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); } while ((inp = inp_next(&inpi)) != NULL) if (inp->inp_gencnt == params->sop_id) { if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } so = inp->inp_socket; KASSERT(so != NULL, ("inp_socket == NULL")); soref(so); if (params->sop_level == SOL_SOCKET) { INP_WUNLOCK(inp); error = sosetopt(so, &sopt); } else error = (*ctloutput_set)(inp, &sopt); sorele(so); break; } if (inp == NULL) error = ESRCH; return (error); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) return (EOPNOTSUPP); error = mst->sw->snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); /* * If there is already a send tag, or the INP is being torn * down, allocating a new send tag is not allowed. Else send * tags may leak. */ if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) return (EINVAL); error = m_snd_tag_alloc(ifp, ¶ms, st); #ifdef INET if (error == 0) { counter_u64_add(rate_limit_set_ok, 1); counter_u64_add(rate_limit_active, 1); } else if (error != EOPNOTSUPP) counter_u64_add(rate_limit_alloc_fail, 1); #endif return (error); } void in_pcbdetach_tag(struct m_snd_tag *mst) { m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } int in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { int error; /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new * tag if we fail to allocate one this time. */ if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { in_pcbdetach_txrtlmt(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; return (error); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #ifdef INET static void rl_init(void *st) { rate_limit_new = counter_u64_alloc(M_WAITOK); rate_limit_chg = counter_u64_alloc(M_WAITOK); rate_limit_active = counter_u64_alloc(M_WAITOK); rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); rate_limit_set_ok = counter_u64_alloc(M_WAITOK); } SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif #endif /* RATELIMIT */ diff --git a/sys/netinet/tcp_fastopen.h b/sys/netinet/tcp_fastopen.h index ee860941fdb8..f70de4bb8947 100644 --- a/sys/netinet/tcp_fastopen.h +++ b/sys/netinet/tcp_fastopen.h @@ -1,141 +1,142 @@ /*- * Copyright (c) 2015-2017 Patrick Kelsey * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _TCP_FASTOPEN_H_ #define _TCP_FASTOPEN_H_ #ifdef _KERNEL #include "opt_inet.h" #define TCP_FASTOPEN_COOKIE_LEN 8 /* SipHash24 64-bit output */ #ifdef TCP_RFC7413 VNET_DECLARE(unsigned int, tcp_fastopen_client_enable); #define V_tcp_fastopen_client_enable VNET(tcp_fastopen_client_enable) VNET_DECLARE(unsigned int, tcp_fastopen_server_enable); #define V_tcp_fastopen_server_enable VNET(tcp_fastopen_server_enable) #else #define V_tcp_fastopen_client_enable 0 #define V_tcp_fastopen_server_enable 0 #endif /* TCP_RFC7413 */ union tcp_fastopen_ip_addr { struct in_addr v4; struct in6_addr v6; }; struct tcp_fastopen_ccache_entry { TAILQ_ENTRY(tcp_fastopen_ccache_entry) cce_link; union tcp_fastopen_ip_addr cce_client_ip; /* network byte order */ union tcp_fastopen_ip_addr cce_server_ip; /* network byte order */ uint16_t server_port; /* network byte order */ uint16_t server_mss; /* host byte order */ uint8_t af; uint8_t cookie_len; uint8_t cookie[TCP_FASTOPEN_MAX_COOKIE_LEN]; sbintime_t disable_time; /* non-zero value means path is disabled */ }; struct tcp_fastopen_ccache; struct tcp_fastopen_ccache_bucket { struct mtx ccb_mtx; TAILQ_HEAD(bucket_entries, tcp_fastopen_ccache_entry) ccb_entries; int ccb_num_entries; struct tcp_fastopen_ccache *ccb_ccache; }; struct tcp_fastopen_ccache { uma_zone_t zone; struct tcp_fastopen_ccache_bucket *base; unsigned int bucket_limit; unsigned int buckets; unsigned int mask; uint32_t secret; }; +struct tcpcb; #ifdef TCP_RFC7413 void tcp_fastopen_init(void); void tcp_fastopen_destroy(void); unsigned int *tcp_fastopen_alloc_counter(void); void tcp_fastopen_decrement_counter(unsigned int *); int tcp_fastopen_check_cookie(struct in_conninfo *, uint8_t *, unsigned int, uint64_t *); void tcp_fastopen_connect(struct tcpcb *); void tcp_fastopen_disable_path(struct tcpcb *); void tcp_fastopen_update_cache(struct tcpcb *, uint16_t, uint8_t, uint8_t *); #else static __inline void tcp_fastopen_init(void) { } static __inline void tcp_fastopen_destroy(void) { } static __inline unsigned int * tcp_fastopen_alloc_counter(void) { return (NULL); } static __inline void tcp_fastopen_decrement_counter(unsigned int *_counter) { } static __inline int tcp_fastopen_check_cookie(struct in_conninfo *_inc, uint8_t *_cookie, unsigned int _len, uint64_t *_latest_cookie) { return (-1); } static __inline void tcp_fastopen_connect(struct tcpcb *_tp) { } static __inline void tcp_fastopen_disable_path(struct tcpcb *_tp) { } static __inline void tcp_fastopen_update_cache(struct tcpcb *_tp, uint16_t _mss, uint8_t _cookie_len, uint8_t *_cookie) { } #endif /* TCP_RFC7413 */ #endif /* _KERNEL */ #endif /* _TCP_FASTOPEN_H_ */ diff --git a/sys/netinet/tcp_offload.h b/sys/netinet/tcp_offload.h index b397336ab487..c76408ccd07f 100644 --- a/sys/netinet/tcp_offload.h +++ b/sys/netinet/tcp_offload.h @@ -1,53 +1,55 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef _NETINET_TCP_OFFLOAD_H_ #define _NETINET_TCP_OFFLOAD_H_ #ifndef _KERNEL #error "no user-serviceable parts inside" #endif #include extern int registered_toedevs; +struct tcpcb; + int tcp_offload_connect(struct socket *, struct sockaddr *); void tcp_offload_listen_start(struct tcpcb *); void tcp_offload_listen_stop(struct tcpcb *); void tcp_offload_input(struct tcpcb *, struct mbuf *); int tcp_offload_output(struct tcpcb *); void tcp_offload_rcvd(struct tcpcb *); void tcp_offload_ctloutput(struct tcpcb *, int, int); void tcp_offload_tcp_info(const struct tcpcb *, struct tcp_info *); int tcp_offload_alloc_tls_session(struct tcpcb *, struct ktls_session *, int); void tcp_offload_detach(struct tcpcb *); void tcp_offload_pmtu_update(struct tcpcb *, tcp_seq, int); #endif diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 3e24db552618..91fafdec672a 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -1,745 +1,722 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition * to SOCK_CLOEXEC and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x00000001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ #define SO_REUSEADDR 0x00000004 /* allow local address reuse */ #define SO_KEEPALIVE 0x00000008 /* keep connections alive */ #define SO_DONTROUTE 0x00000010 /* just use interface addresses */ #define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x00000080 /* linger on close if data present */ #define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ #define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ #define SO_RERROR 0x00020000 /* keep track of receive errors */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #define SO_DOMAIN 0x1019 /* get socket domain */ #endif #if __BSD_VISIBLE #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_NETLINK 38 /* Netlink protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_HYPERV 43 /* HyperV sockets */ #define AF_DIVERT 44 /* divert(4) */ #define AF_MAX 44 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_DIVERT AF_DIVERT #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ #define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification */ #define MSG_NBIO 0x00004000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x00008000 /* used in sendit() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x00010000 /* for use by socket callbacks - soreceive (TCP) */ #endif #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x00020000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_CMSG_CLOEXEC 0x00040000 /* make received fds close-on-exec */ #define MSG_WAITFORONE 0x00080000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #define MSG_TLSAPPDATA 0x00200000 /* do not soreceive() alert rec. (TLS) */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number or groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials (LOCAL_CREDS). */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) /* * Socket credentials (LOCAL_CREDS_PERSISTENT). */ struct sockcred2 { int sc_version; /* version of this structure */ pid_t sc_pid; /* PID of sending process */ uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; #define SOCKCRED2SIZE(ngrps) \ (sizeof(struct sockcred2) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ #define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #define SCM_TIME_INFO 0x07 /* timestamp info */ #define SCM_CREDS2 0x08 /* process creds (struct sockcred2) */ struct sock_timestamp_info { __uint32_t st_info_flags; __uint32_t st_info_pad0; __uint64_t st_info_rsv[7]; }; #define ST_INFO_HW 0x0001 /* SCM_TIMESTAMP was hw */ #define ST_INFO_HW_HPREC 0x0002 /* SCM_TIMESTAMP was hw-assisted on entrance */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ enum shutdown_how { SHUT_RD = 0, /* shut down the reading side */ SHUT_WR, /* shut down the writing side */ SHUT_RDWR /* shut down both sides */ }; #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; -struct inpcb *so_sotoinpcb(struct socket *so); -struct sockbuf *so_sockbuf_snd(struct socket *); -struct sockbuf *so_sockbuf_rcv(struct socket *); - -int so_state_get(const struct socket *); -void so_state_set(struct socket *, int); - int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); - -int so_linger_get(const struct socket *); -void so_linger_set(struct socket *, int); - -struct protosw *so_protosw_get(const struct socket *); -void so_protosw_set(struct socket *, struct protosw *); - -void so_sorwakeup_locked(struct socket *so); -void so_sowwakeup_locked(struct socket *so); - -void so_sorwakeup(struct socket *so); -void so_sowwakeup(struct socket *so); - -void so_lock(struct socket *so); -void so_unlock(struct socket *so); - #endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */