diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index e53b0367960b..52f4b6cdf7f9 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -1,834 +1,834 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td); if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(uio->uio_td->td_proc); tdsignal(uio->uio_td, SIGPIPE); PROC_UNLOCK(uio->uio_td->td_proc); } return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags |= SB_ASYNC; so->sol_sbsnd_flags |= SB_ASYNC; } else { SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags &= ~SB_ASYNC; so->sol_sbsnd_flags &= ~SB_ASYNC; } else { SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); } break; case FIONREAD: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_rcv); } break; case FIONWRITE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_snd); } break; case FIONSPACE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) { *(int *)data = 0; } else { *(int *)data = sbspace(&so->so_snd); } } break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; } break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); - error = rtioctl_fib(cmd, data, so->so_fibnum); + error = rtioctl_fib(cmd, data, so->so_fibnum, td); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = ((*so->so_proto->pr_usrreqs->pru_control) (so, cmd, data, 0, td)); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error; bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif SOCK_LOCK(so); if (!SOLISTENING(so)) { struct sockbuf *sb; /* * If SBS_CANTRCVMORE is set, but there's still data left * in the receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCKBUF_UNLOCK(sb); sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCKBUF_UNLOCK(sb); } ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; error = so->so_proto->pr_usrreqs->pru_sense(so, ub); SOCK_UNLOCK(so); return (error); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. */ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; CURVNET_SET(so->so_vnet); kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: if (kif->kf_un.kf_sock.kf_sock_protocol0 == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); CURVNET_RESTORE(); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. */ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. */ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); if (soaio_target_procs > 0) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct file *fp; size_t cnt, done, job_total_nbytes; long ru_before; int error, flags; SOCKBUF_UNLOCK(sb); aio_switch_vmspace(job); td = curthread; fp = job->fd_file; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = job->aio_done; cnt = job->uiop->uio_resid; job->uiop->uio_offset = 0; job->uiop->uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. */ if (sb == &so->so_rcv) { ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, job->uiop, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sosend(so, NULL, job->uiop, NULL, NULL, flags, td); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. */ MPASS(done != job_total_nbytes); SOCKBUF_LOCK(sb); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCKBUF_UNLOCK(sb); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCKBUF_UNLOCK(sb); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCKBUF_LOCK(sb); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCKBUF_UNLOCK(sb); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCKBUF_LOCK(sb); } static void soaio_process_sb(struct socket *so, struct sockbuf *sb) { struct kaiocb *job; CURVNET_SET(so->so_vnet); SOCKBUF_LOCK(sb); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, sb, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. */ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_rcv); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_snd); } void sowakeup_aio(struct socket *so, struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; soref(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode & LIO_READ) sb = &so->so_rcv; else { MPASS(opcode & LIO_WRITE); sb = &so->so_snd; } SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(sb); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; int error; so = fp->f_data; error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job); if (error == 0) return (0); switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { case LIO_READ: sb = &so->so_rcv; break; case LIO_WRITE: sb = &so->so_snd; break; default: return (EINVAL); } SOCKBUF_LOCK(sb); if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, sb); else sb->sb_flags |= SB_AIO; } SOCKBUF_UNLOCK(sb); return (0); } diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 7f06b51cf096..92a204aafef0 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4415 +1,4419 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include +#include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); + if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) + return (ECAPMODE); + if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); log(LOG_DEBUG, "%s: pcb %p (%s): Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(&descrsb), qlen, overcount); sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we don't have SS_NOFDREF neither SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After relock the socket may not move to so_comp since it * doesn't have PCB already, but it may be removed from * so_incomp. If that happens, we share responsiblity on * freeing the socket, but soclose() has already removed * it from queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; /* This is guarenteed not to be the last. */ refcount_release(&sol->so_count); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele(sol); KASSERT(so->so_count == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); SOCK_LOCK(so); if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); /* Guaranteed not to be the last. */ refcount_release(&so->so_count); } } KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (sp->so_count == 0) { SOCK_UNLOCK(sp); soabort(sp); } else /* sp is now in sofree() */ SOCK_UNLOCK(sp); } } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { /* * Note that error is intentionally * ignored. * * Like sendfile(), we rely on the * completion routine (pru_ready()) * to free the mbufs in the event that * pru_send() encountered an error and * did not append them to the sockbuf. */ soref(so); ktls_enqueue(top, so, tls_enq_cnt); } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { m_freem(top); m_freem(control); error = ENOTCONN; } CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for a non-application data * record. If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* This will need to change for TLS 1.3. */ if (tgr.tls_type != TLS_RLTYPE_APP) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmappedtouio(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); sbunlock(sb); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); else error = ENOTCONN; CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { SOLISTEN_LOCK(so); so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } goto done; } CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele(head); return; } /* Not the last one, as so holds a ref. */ refcount_release(&head->so_count); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); goto again; } SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 4466ae8822cd..3f7198c2f3ae 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1,2995 +1,2996 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All Rights Reserved. * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved. * Copyright (c) 2018 Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. * * The implementation is substantially complicated by the fact that * "ancillary data", such as file descriptors or credentials, may be passed * across UNIX domain sockets. The potential for passing UNIX domain sockets * over other UNIX domain sockets requires the implementation of a simple * garbage collector to find and tear down cycles of disconnected sockets. * * TODO: * RDM * rethink name space problems * need a proper out-of-band */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include MALLOC_DECLARE(M_FILECAPS); /* * See unpcb.h for the locking key. */ static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; /* (l) */ static u_int unp_count; /* (l) Count of local sockets. */ static ino_t unp_ino; /* Prototype for fake inode numbers. */ static int unp_rights; /* (g) File descriptors in flight. */ static struct unp_head unp_shead; /* (l) List of stream sockets. */ static struct unp_head unp_dhead; /* (l) List of datagram sockets. */ static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */ struct unp_defer { SLIST_ENTRY(unp_defer) ud_link; struct file *ud_fp; }; static SLIST_HEAD(, unp_defer) unp_defers; static int unp_defers_count; static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct timeout_task unp_gc_task; /* * The close of unix domain sockets attached as SCM_RIGHTS is * postponed to the taskqueue, to avoid arbitrary recursion depth. * The attached sockets might have another sockets attached. */ static struct task unp_defer_task; /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ. * * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */ static u_long unpsp_recvspace = PIPSIZ; static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Local domain"); static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_STREAM"); static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_DGRAM"); static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SOCK_SEQPACKET"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, "Default stream send space."); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, "Default stream receive space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, "Default datagram send space."); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, "Default datagram receive space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW, &unpsp_sendspace, 0, "Default seqpacket send space."); SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW, &unpsp_recvspace, 0, "Default seqpacket receive space."); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "File descriptors in flight."); SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, &unp_defers_count, 0, "File descriptors deferred to taskqueue for close."); /* * Locking and synchronization: * * Several types of locks exist in the local domain socket implementation: * - a global linkage lock * - a global connection list lock * - the mtxpool lock * - per-unpcb mutexes * * The linkage lock protects the global socket lists, the generation number * counter and garbage collector state. * * The connection list lock protects the list of referring sockets in a datagram * socket PCB. This lock is also overloaded to protect a global list of * sockets whose buffers contain socket references in the form of SCM_RIGHTS * messages. To avoid recursion, such references are released by a dedicated * thread. * * The mtxpool lock protects the vnode from being modified while referenced. * Lock ordering rules require that it be acquired before any PCB locks. * * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the * unpcb. This includes the unp_conn field, which either links two connected * PCBs together (for connected socket types) or points at the destination * socket (for connectionless socket types). The operations of creating or * destroying a connection therefore involve locking multiple PCBs. To avoid * lock order reversals, in some cases this involves dropping a PCB lock and * using a reference counter to maintain liveness. * * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer, * allocated in pru_attach() and freed in pru_detach(). The validity of that * pointer is an invariant, so no lock is required to dereference the so_pcb * pointer if a valid socket reference is held by the caller. In practice, * this is always true during operations performed on a socket. Each unpcb * has a back-pointer to its socket, unp_socket, which will be stable under * the same circumstances. * * This pointer may only be safely dereferenced as long as a valid reference * to the unpcb is held. Typically, this reference will be from the socket, * or from another unpcb when the referring unpcb's lock is held (in order * that the reference not be invalidated during use). For example, to follow * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee * that detach is not run clearing unp_socket. * * Blocking with UNIX domain sockets is a tricky issue: unlike most network * protocols, bind() is a non-atomic operation, and connect() requires * potential sleeping in the protocol, due to potentially waiting on local or * distributed file systems. We try to separate "lookup" operations, which * may sleep, and the IPC operations themselves, which typically can occur * with relative atomicity as locks can be held over the entire operation. * * Another tricky issue is simultaneous multi-threaded or multi-process * access to a single UNIX domain socket. These are handled by the flags * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or * binding, both of which involve dropping UNIX domain socket locks in order * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ "unp_link_rwlock") #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_LOCKED) #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_UNLOCKED) #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock) #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock) #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock) #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock) #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK(); #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK(); #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ "unp", "unp", \ MTX_DUPOK|MTX_DEF) #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx) #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx) #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx) #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED) static int uipc_connect2(struct socket *, struct socket *); static int uipc_ctloutput(struct socket *, struct sockopt *); static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connectat(int, struct socket *, struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_dispose(struct socket *so); static void unp_dispose_mbuf(struct mbuf *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); static void unp_discard(struct file *); static void unp_freerights(struct filedescent **, int); static void unp_init(void); static int unp_internalize(struct mbuf **, struct thread *); static void unp_internalize_fp(struct file *); static int unp_externalize(struct mbuf *, struct mbuf **, int); static int unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *, int); static void unp_process_defers(void * __unused, int); static void unp_pcb_hold(struct unpcb *unp) { u_int old __unused; old = refcount_acquire(&unp->unp_refcount); KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp)); } static __result_use_check bool unp_pcb_rele(struct unpcb *unp) { bool ret; UNP_PCB_LOCK_ASSERT(unp); if ((ret = refcount_release(&unp->unp_refcount))) { UNP_PCB_UNLOCK(unp); UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } return (ret); } static void unp_pcb_rele_notlast(struct unpcb *unp) { bool ret __unused; ret = refcount_release(&unp->unp_refcount); KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp)); } static void unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2) { UNP_PCB_UNLOCK_ASSERT(unp); UNP_PCB_UNLOCK_ASSERT(unp2); if (unp == unp2) { UNP_PCB_LOCK(unp); } else if ((uintptr_t)unp2 > (uintptr_t)unp) { UNP_PCB_LOCK(unp); UNP_PCB_LOCK(unp2); } else { UNP_PCB_LOCK(unp2); UNP_PCB_LOCK(unp); } } static void unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2) { UNP_PCB_UNLOCK(unp); if (unp != unp2) UNP_PCB_UNLOCK(unp2); } /* * Try to lock the connected peer of an already locked socket. In some cases * this requires that we unlock the current socket. The pairbusy counter is * used to block concurrent connection attempts while the lock is dropped. The * caller must be careful to revalidate PCB state. */ static struct unpcb * unp_pcb_lock_peer(struct unpcb *unp) { struct unpcb *unp2; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if (unp2 == NULL) return (NULL); if (__predict_false(unp == unp2)) return (unp); UNP_PCB_UNLOCK_ASSERT(unp2); if (__predict_true(UNP_PCB_TRYLOCK(unp2))) return (unp2); if ((uintptr_t)unp2 > (uintptr_t)unp) { UNP_PCB_LOCK(unp2); return (unp2); } unp->unp_pairbusy++; unp_pcb_hold(unp2); UNP_PCB_UNLOCK(unp); UNP_PCB_LOCK(unp2); UNP_PCB_LOCK(unp); KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL, ("%s: socket %p was reconnected", __func__, unp)); if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) { unp->unp_flags &= ~UNP_WAITING; wakeup(unp); } if (unp_pcb_rele(unp2)) { /* unp2 is unlocked. */ return (NULL); } if (unp->unp_conn == NULL) { UNP_PCB_UNLOCK(unp2); return (NULL); } return (unp2); } /* * Definitions of protocols supported in the LOCAL domain. */ static struct domain localdomain; static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream; static struct pr_usrreqs uipc_usrreqs_seqpacket; static struct protosw localsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &localdomain, - .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS| + PR_CAPATTACH, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_stream }, { .pr_type = SOCK_DGRAM, .pr_domain = &localdomain, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS|PR_CAPATTACH, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_dgram }, { .pr_type = SOCK_SEQPACKET, .pr_domain = &localdomain, /* * XXXRW: For now, PR_ADDR because soreceive will bump into them * due to our use of sbappendaddr. A new sbappend variants is needed * that supports both atomic record writes and control data. */ - .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD| - PR_RIGHTS, + .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED| + PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs_seqpacket, }, }; static struct domain localdomain = { .dom_family = AF_LOCAL, .dom_name = "local", .dom_init = unp_init, .dom_externalize = unp_externalize, .dom_dispose = unp_dispose, .dom_protosw = localsw, .dom_protoswNPROTOSW = &localsw[nitems(localsw)] }; DOMAIN_SET(local); static void uipc_abort(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); UNP_PCB_UNLOCK_ASSERT(unp); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { unp_pcb_hold(unp2); UNP_PCB_UNLOCK(unp); unp_drop(unp2); } else UNP_PCB_UNLOCK(unp); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; /* * Pass back name of connected socket, if it was bound and we are * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); unp2 = unp_pcb_lock_peer(unp); if (unp2 != NULL && unp2->unp_addr != NULL) sa = (struct sockaddr *)unp2->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); if (unp2 != NULL) unp_pcb_unlock_pair(unp, unp2); else UNP_PCB_UNLOCK(unp); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { u_long sendspace, recvspace; struct unpcb *unp; int error; bool locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: sendspace = unpst_sendspace; recvspace = unpst_recvspace; break; case SOCK_DGRAM: sendspace = unpdg_sendspace; recvspace = unpdg_recvspace; break; case SOCK_SEQPACKET: sendspace = unpsp_sendspace; recvspace = unpsp_recvspace; break; default: panic("uipc_attach"); } error = soreserve(so, sendspace, recvspace); if (error) return (error); } unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); if (unp == NULL) return (ENOBUFS); LIST_INIT(&unp->unp_refs); UNP_PCB_LOCK_INIT(unp); unp->unp_socket = so; so->so_pcb = unp; refcount_init(&unp->unp_refcount, 1); if ((locked = UNP_LINK_WOWNED()) == false) UNP_LINK_WLOCK(); unp->unp_gencnt = ++unp_gencnt; unp->unp_ino = ++unp_ino; unp_count++; switch (so->so_type) { case SOCK_STREAM: LIST_INSERT_HEAD(&unp_shead, unp, unp_link); break; case SOCK_DGRAM: LIST_INSERT_HEAD(&unp_dhead, unp, unp_link); break; case SOCK_SEQPACKET: LIST_INSERT_HEAD(&unp_sphead, unp, unp_link); break; default: panic("uipc_attach"); } if (locked == false) UNP_LINK_WUNLOCK(); return (0); } static int uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vattr vattr; int error, namelen; struct nameidata nd; struct unpcb *unp; struct vnode *vp; struct mount *mp; cap_rights_t rights; char *buf; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); if (soun->sun_len > sizeof(struct sockaddr_un)) return (EINVAL); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. * * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding simplifies the * implementation and avoids a great many possible failure modes. */ UNP_PCB_LOCK(unp); if (unp->unp_vnode != NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_PCB_UNLOCK(unp); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); bcopy(soun->sun_path, buf, namelen); buf[namelen] = 0; restart: NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT), td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) { VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; goto error; } vp = nd.ni_vp; ASSERT_VOP_ELOCKED(vp, "uipc_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_PCB_LOCK(unp); VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); vref(vp); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); free(buf, M_TEMP); return (0); error: UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); free(buf, M_TEMP); return (error); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (uipc_bindat(AT_FDCWD, so, nam, td)); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); error = unp_connect(so, nam, td); return (error); } static int uipc_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connectat: td != curthread")); error = unp_connectat(fd, so, nam, td); return (error); } static void uipc_close(struct socket *so) { struct unpcb *unp, *unp2; struct vnode *vp = NULL; struct mtx *vplock; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); vplock = NULL; if ((vp = unp->unp_vnode) != NULL) { vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); } UNP_PCB_LOCK(unp); if (vp && unp->unp_vnode == NULL) { mtx_unlock(vplock); vp = NULL; } if (vp != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; } if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); if (vp) { mtx_unlock(vplock); vrele(vp); } } static int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp, *unp2; int error; unp = so1->so_pcb; KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); unp2 = so2->so_pcb; KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); unp_pcb_lock_pair(unp, unp2); error = unp_connect2(so1, so2, PRU_CONNECT2); unp_pcb_unlock_pair(unp, unp2); return (error); } static void uipc_detach(struct socket *so) { struct unpcb *unp, *unp2; struct mtx *vplock; struct vnode *vp; int local_unp_rights; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); vp = NULL; vplock = NULL; SOCK_LOCK(so); if (!SOLISTENING(so)) { /* * Once the socket is removed from the global lists, * uipc_ready() will not be able to locate its socket buffer, so * clear the buffer now. At this point internalized rights have * already been disposed of. */ sbrelease(&so->so_rcv, so); } SOCK_UNLOCK(so); UNP_LINK_WLOCK(); LIST_REMOVE(unp, unp_link); if (unp->unp_gcflag & UNPGC_DEAD) LIST_REMOVE(unp, unp_dead); unp->unp_gencnt = ++unp_gencnt; --unp_count; UNP_LINK_WUNLOCK(); UNP_PCB_UNLOCK_ASSERT(unp); restart: if ((vp = unp->unp_vnode) != NULL) { vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); } UNP_PCB_LOCK(unp); if (unp->unp_vnode != vp && unp->unp_vnode != NULL) { if (vplock) mtx_unlock(vplock); UNP_PCB_UNLOCK(unp); goto restart; } if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; } if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); UNP_REF_LIST_LOCK(); while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); unp_pcb_hold(ref); UNP_REF_LIST_UNLOCK(); MPASS(ref != unp); UNP_PCB_UNLOCK_ASSERT(ref); unp_drop(ref); UNP_REF_LIST_LOCK(); } UNP_REF_LIST_UNLOCK(); UNP_PCB_LOCK(unp); local_unp_rights = unp_rights; unp->unp_socket->so_pcb = NULL; unp->unp_socket = NULL; free(unp->unp_addr, M_SONAME); unp->unp_addr = NULL; if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); if (vp) { mtx_unlock(vplock); vrele(vp); } if (local_unp_rights) taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1); } static int uipc_disconnect(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) unp_disconnect(unp, unp2); else UNP_PCB_UNLOCK(unp); return (0); } static int uipc_listen(struct socket *so, int backlog, struct thread *td) { struct unpcb *unp; int error; if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET) return (EOPNOTSUPP); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); UNP_PCB_LOCK(unp); if (unp->unp_vnode == NULL) { /* Already connected or not bound to an address. */ error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ; UNP_PCB_UNLOCK(unp); return (error); } SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { cru2xt(td, &unp->unp_peercred); solisten_proto(so, backlog); } SOCK_UNLOCK(so); UNP_PCB_UNLOCK(unp); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LINK_RLOCK(); /* * XXX: It seems that this test always fails even when connection is * established. So, this else clause is added as workaround to * return PF_LOCAL sockaddr. */ unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); if (unp2->unp_addr != NULL) sa = (struct sockaddr *) unp2->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_LINK_RUNLOCK(); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); /* * Adjust backpressure on sender and wakeup any waiting to write. * * The unp lock is acquired to maintain the validity of the unp_conn * pointer; no lock on unp2 is required as unp2->unp_socket will be * static as long as we don't permit unp2 to disconnect from unp, * which is prevented by the lock on unp. We cache values from * so_rcv to avoid holding the so_rcv lock over the entire * transaction on the remote so_snd. */ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = sbavail(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); /* * There is a benign race condition at this point. If we're planning to * clear SB_STOP, but uipc_send is called on the connected socket at * this instant, it might add data to the sockbuf and set SB_STOP. Then * we would erroneously clear SB_STOP below, even though the sockbuf is * full. The race is benign because the only ill effect is to allow the * sockbuf to exceed its size limit, and the size limits are not * strictly guaranteed anyway. */ UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); return (0); } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_snd); if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax) so2->so_snd.sb_flags &= ~SB_STOP; sowwakeup_locked(so2); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; int freed, error; unp = sotounpcb(so); KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM || so->so_type == SOCK_SEQPACKET, ("%s: socktype %d", __func__, so->so_type)); freed = error = 0; if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; unp2 = NULL; switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; if (nam != NULL) { error = unp_connect(so, nam, td); if (error != 0) break; } UNP_PCB_LOCK(unp); /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ unp2 = unp_pcb_lock_peer(unp); if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; break; } if (unp2->unp_flags & UNP_WANTCRED_MASK) control = unp_addsockcred(td, control, unp2->unp_flags); if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) unp_disconnect(unp, unp2); else unp_pcb_unlock_pair(unp, unp2); break; } case SOCK_SEQPACKET: case SOCK_STREAM: if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); if (error != 0) break; } else { error = ENOTCONN; break; } } UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) { UNP_PCB_UNLOCK(unp); error = ENOTCONN; break; } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) { unp_pcb_unlock_pair(unp, unp2); error = EPIPE; break; } UNP_PCB_UNLOCK(unp); if ((so2 = unp2->unp_socket) == NULL) { UNP_PCB_UNLOCK(unp2); error = ENOTCONN; break; } SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED_MASK) { /* * Credentials are passed only once on SOCK_STREAM and * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS). */ control = unp_addsockcred(td, control, unp2->unp_flags); unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT; } /* * Send to paired receive port and wake up readers. Don't * check for space available in the receive buffer if we're * attaching ancillary data; Unix domain sockets only check * for space in the sending sockbuf, and that check is * performed one level up the stack. At that level we cannot * precisely account for the amount of buffer space used * (e.g., because control messages are not yet internalized). */ switch (so->so_type) { case SOCK_STREAM: if (control != NULL) { sbappendcontrol_locked(&so2->so_rcv, m, control, flags); control = NULL; } else sbappend_locked(&so2->so_rcv, m, flags); break; case SOCK_SEQPACKET: if (sbappendaddr_nospacecheck_locked(&so2->so_rcv, &sun_noname, m, control)) control = NULL; break; } mbcnt = so2->so_rcv.sb_mbcnt; sbcc = sbavail(&so2->so_rcv); if (sbcc) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); /* * The PCB lock on unp2 protects the SB_STOP flag. Without it, * it would be possible for uipc_rcvd to be called at this * point, drain the receiving sockbuf, clear SB_STOP, and then * we would set SB_STOP below. That could lead to an empty * sockbuf having SB_STOP set */ SOCKBUF_LOCK(&so->so_snd); if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax) so->so_snd.sb_flags |= SB_STOP; SOCKBUF_UNLOCK(&so->so_snd); UNP_PCB_UNLOCK(unp2); m = NULL; break; } /* * PRUS_EOF is equivalent to pru_send followed by pru_shutdown. */ if (flags & PRUS_EOF) { UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); } if (control != NULL && error != 0) unp_dispose_mbuf(control); release: if (control != NULL) m_freem(control); /* * In case of PRUS_NOTREADY, uipc_ready() is responsible * for freeing memory. */ if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); return (error); } static bool uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp) { struct mbuf *mb, *n; struct sockbuf *sb; SOCK_LOCK(so); if (SOLISTENING(so)) { SOCK_UNLOCK(so); return (false); } mb = NULL; sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (sb->sb_fnrdy != NULL) { for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) { if (mb == m) { *errorp = sbready(sb, m, count); break; } mb = mb->m_next; if (mb == NULL) { mb = n; if (mb != NULL) n = mb->m_nextpkt; } } } SOCKBUF_UNLOCK(sb); SOCK_UNLOCK(so); return (mb != NULL); } static int uipc_ready(struct socket *so, struct mbuf *m, int count) { struct unpcb *unp, *unp2; struct socket *so2; int error, i; unp = sotounpcb(so); KASSERT(so->so_type == SOCK_STREAM, ("%s: unexpected socket type for %p", __func__, so)); UNP_PCB_LOCK(unp); if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { UNP_PCB_UNLOCK(unp); so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if ((error = sbready(&so2->so_rcv, m, count)) == 0) sorwakeup_locked(so2); else SOCKBUF_UNLOCK(&so2->so_rcv); UNP_PCB_UNLOCK(unp2); return (error); } UNP_PCB_UNLOCK(unp); /* * The receiving socket has been disconnected, but may still be valid. * In this case, the now-ready mbufs are still present in its socket * buffer, so perform an exhaustive search before giving up and freeing * the mbufs. */ UNP_LINK_RLOCK(); LIST_FOREACH(unp, &unp_shead, unp_link) { if (uipc_ready_scan(unp->unp_socket, m, count, &error)) break; } UNP_LINK_RUNLOCK(); if (unp == NULL) { for (i = 0; i < count; i++) m = m_free(m); error = ECONNRESET; } return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); sb->st_blksize = so->so_snd.sb_hiwat; sb->st_dev = NODEV; sb->st_ino = unp->unp_ino; return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp); return (0); } static struct pr_usrreqs uipc_usrreqs_dgram = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_dgram, .pru_close = uipc_close, }; static struct pr_usrreqs uipc_usrreqs_seqpacket = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, /* XXX: or...? */ .pru_close = uipc_close, }; static struct pr_usrreqs uipc_usrreqs_stream = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_bindat = uipc_bindat, .pru_connect = uipc_connect, .pru_connectat = uipc_connectat, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_ready = uipc_ready, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_soreceive = soreceive_generic, .pru_close = uipc_close, }; static int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != SOL_LOCAL) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; case LOCAL_CREDS: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CREDS_PERSISTENT: /* Unlocked read. */ optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: /* Unlocked read. */ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CREDS_PERSISTENT: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit, exclusive) do { \ UNP_PCB_LOCK(unp); \ if (optval) { \ if ((unp->unp_flags & (exclusive)) != 0) { \ UNP_PCB_UNLOCK(unp); \ error = EINVAL; \ break; \ } \ unp->unp_flags |= (bit); \ } else \ unp->unp_flags &= ~(bit); \ UNP_PCB_UNLOCK(unp); \ } while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS); break; case LOCAL_CREDS_PERSISTENT: OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT, 0); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (unp_connectat(AT_FDCWD, so, nam, td)); } static int unp_connectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { struct mtx *vplock; struct sockaddr_un *soun; struct vnode *vp; struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; cap_rights_t rights; int error, len; bool connreq; if (nam->sa_family != AF_UNIX) return (EAFNOSUPPORT); if (nam->sa_len > sizeof(struct sockaddr_un)) return (EINVAL); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); soun = (struct sockaddr_un *)nam; bcopy(soun->sun_path, buf, len); buf[len] = 0; unp = sotounpcb(so); UNP_PCB_LOCK(unp); for (;;) { /* * Wait for connection state to stabilize. If a connection * already exists, give up. For datagram sockets, which permit * multiple consecutive connect(2) calls, upper layers are * responsible for disconnecting in advance of a subsequent * connect(2), but this is not synchronized with PCB connection * state. * * Also make sure that no threads are currently attempting to * lock the peer socket, to ensure that unp_conn cannot * transition between two valid sockets while locks are dropped. */ if (unp->unp_conn != NULL) { UNP_PCB_UNLOCK(unp); return (EISCONN); } if ((unp->unp_flags & UNP_CONNECTING) != 0) { UNP_PCB_UNLOCK(unp); return (EALREADY); } if (unp->unp_pairbusy > 0) { unp->unp_flags |= UNP_WAITING; mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0); continue; } break; } unp->unp_flags |= UNP_CONNECTING; UNP_PCB_UNLOCK(unp); connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0; if (connreq) sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); else sa = NULL; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT), td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); NDFREE_NOTHING(&nd); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); if (error) goto bad; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp2); if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } if (connreq) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else so2 = NULL; if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } unp3 = sotounpcb(so2); unp_pcb_lock_pair(unp2, unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } unp_copy_peercred(td, unp3, unp, unp2); UNP_PCB_UNLOCK(unp2); unp2 = unp3; /* * It is safe to block on the PCB lock here since unp2 is * nascent and cannot be connected to any other sockets. */ UNP_PCB_LOCK(unp); #ifdef MAC mac_socketpeer_set_from_socket(so, so2); mac_socketpeer_set_from_socket(so2, so); #endif } else { unp_pcb_lock_pair(unp, unp2); } KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && sotounpcb(so2) == unp2, ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); unp_pcb_unlock_pair(unp, unp2); bad2: mtx_unlock(vplock); bad: if (vp != NULL) { vput(vp); } free(sa, M_SONAME); UNP_PCB_LOCK(unp); KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); unp->unp_flags &= ~UNP_CONNECTING; UNP_PCB_UNLOCK(unp); return (error); } /* * Set socket peer credentials at connection time. * * The client's PCB credentials are copied from its process structure. The * server's PCB credentials are copied from the socket on which it called * listen(2). uipc_listen cached that process's credentials at the time. */ void unp_copy_peercred(struct thread *td, struct unpcb *client_unp, struct unpcb *server_unp, struct unpcb *listen_unp) { cru2xt(td, &client_unp->unp_peercred); client_unp->unp_flags |= UNP_HAVEPC; memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, sizeof(server_unp->unp_peercred)); server_unp->unp_flags |= UNP_HAVEPC; client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { struct unpcb *unp; struct unpcb *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp)); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp->unp_conn = unp2; unp_pcb_hold(unp2); unp_pcb_hold(unp); switch (so->so_type) { case SOCK_DGRAM: UNP_REF_LIST_LOCK(); LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); UNP_REF_LIST_UNLOCK(); soisconnected(so); break; case SOCK_STREAM: case SOCK_SEQPACKET: KASSERT(unp2->unp_conn == NULL, ("%s: socket %p is already connected", __func__, unp2)); unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { struct socket *so, *so2; #ifdef INVARIANTS struct unpcb *unptmp; #endif UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); KASSERT(unp->unp_conn == unp2, ("%s: unpcb %p is not connected to %p", __func__, unp, unp2)); unp->unp_conn = NULL; so = unp->unp_socket; so2 = unp2->unp_socket; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: UNP_REF_LIST_LOCK(); #ifdef INVARIANTS LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) { if (unptmp == unp) break; } KASSERT(unptmp != NULL, ("%s: %p not found in reflist of %p", __func__, unp, unp2)); #endif LIST_REMOVE(unp, unp_reflink); UNP_REF_LIST_UNLOCK(); if (so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); } break; case SOCK_STREAM: case SOCK_SEQPACKET: if (so) soisdisconnected(so); MPASS(unp2->unp_conn == unp); unp2->unp_conn = NULL; if (so2) soisdisconnected(so2); break; } if (unp == unp2) { unp_pcb_rele_notlast(unp); if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); } else { if (!unp_pcb_rele(unp)) UNP_PCB_UNLOCK(unp); if (!unp_pcb_rele(unp2)) UNP_PCB_UNLOCK(unp2); } } /* * unp_pcblist() walks the global list of struct unpcb's to generate a * pointer list, bumping the refcount on each unpcb. It then copies them out * sequentially, validating the generation number on each to see if it has * been detached. All of this is necessary because copyout() may sleep on * disk I/O. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; u_int i; int error, n; switch ((intptr_t)arg1) { case SOCK_STREAM: head = &unp_shead; break; case SOCK_DGRAM: head = &unp_dhead; break; case SOCK_SEQPACKET: head = &unp_sphead; break; default: panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); } /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO); UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) { UNP_PCB_UNLOCK(unp); continue; } unp_list[i++] = unp; unp_pcb_hold(unp); } UNP_PCB_UNLOCK(unp); } UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; UNP_PCB_LOCK(unp); if (unp_pcb_rele(unp)) continue; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = (uintptr_t)unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); else bzero(&xu->xu_addr, sizeof(xu->xu_addr)); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); else bzero(&xu->xu_caddr, sizeof(xu->xu_caddr)); xu->unp_vnode = (uintptr_t)unp->unp_vnode; xu->unp_conn = (uintptr_t)unp->unp_conn; xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs); xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink); xu->unp_gencnt = unp->unp_gencnt; sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { UNP_PCB_UNLOCK(unp); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", "List of active local seqpacket sockets"); static void unp_shutdown(struct unpcb *unp) { struct unpcb *unp2; struct socket *so; UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if ((unp->unp_socket->so_type == SOCK_STREAM || (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); } } static void unp_drop(struct unpcb *unp) { struct socket *so = unp->unp_socket; struct unpcb *unp2; /* * Regardless of whether the socket's peer dropped the connection * with this socket by aborting or disconnecting, POSIX requires * that ECONNRESET is returned. */ UNP_PCB_LOCK(unp); if (so) so->so_error = ECONNRESET; if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { /* Last reference dropped in unp_disconnect(). */ unp_pcb_rele_notlast(unp); unp_disconnect(unp, unp2); } else if (!unp_pcb_rele(unp)) { UNP_PCB_UNLOCK(unp); } } static void unp_freerights(struct filedescent **fdep, int fdcount) { struct file *fp; int i; KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount)); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; filecaps_free(&fdep[i]->fde_caps); unp_discard(fp); } free(fdep[0], M_FILECAPS); } static int unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct filedesc *fdesc = td->td_proc->p_fd; struct filedescent **fdep; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(*fdep); if (newfds == 0) goto next; fdep = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(fdep, newfds); goto next; } FILEDESC_XLOCK(fdesc); /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_XUNLOCK(fdesc); error = E2BIG; unp_freerights(fdep, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); if (fdallocn(td, 0, fdp, newfds) != 0) { FILEDESC_XUNLOCK(fdesc); error = EMSGSIZE; unp_freerights(fdep, newfds); m_freem(*controlp); *controlp = NULL; goto next; } for (i = 0; i < newfds; i++, fdp++) { _finstall(fdesc, fdep[i]->fde_file, *fdp, (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0, &fdep[i]->fde_caps); unp_externalize_fp(fdep[i]->fde_file); } /* * The new type indicates that the mbuf data refers to * kernel resources that may need to be released before * the mbuf is freed. */ m_chtype(*controlp, MT_EXTCONTROL); FILEDESC_XUNLOCK(fdesc); free(fdep[0], M_FILECAPS); } else { /* We can just copy anything else across. */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } #ifdef INVARIANTS static void unp_zdtor(void *mem, int size __unused, void *arg __unused) { struct unpcb *unp; unp = mem; KASSERT(LIST_EMPTY(&unp->unp_refs), ("%s: unpcb %p has lingering refs", __func__, unp)); KASSERT(unp->unp_socket == NULL, ("%s: unpcb %p has socket backpointer", __func__, unp)); KASSERT(unp->unp_vnode == NULL, ("%s: unpcb %p has vnode references", __func__, unp)); KASSERT(unp->unp_conn == NULL, ("%s: unpcb %p is still connected", __func__, unp)); KASSERT(unp->unp_addr == NULL, ("%s: unpcb %p has leaked addr", __func__, unp)); } #endif static void unp_init(void) { uma_dtor dtor; #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) return; #endif #ifdef INVARIANTS dtor = unp_zdtor; #else dtor = NULL; #endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(unp_zone, maxsockets); uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); LIST_INIT(&unp_sphead); SLIST_INIT(&unp_defers); TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); UNP_LINK_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } static void unp_internalize_cleanup_rights(struct mbuf *control) { struct cmsghdr *cp; struct mbuf *m; void *data; socklen_t datalen; for (m = control; m != NULL; m = m->m_next) { cp = mtod(m, struct cmsghdr *); if (cp->cmsg_level != SOL_SOCKET || cp->cmsg_type != SCM_RIGHTS) continue; data = CMSG_DATA(cp); datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; unp_freerights(data, datalen / sizeof(struct filedesc *)); } } static int unp_internalize(struct mbuf **controlp, struct thread *td) { struct mbuf *control, **initial_controlp; struct proc *p; struct filedesc *fdesc; struct bintime *bt; struct cmsghdr *cm; struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; void *data; socklen_t clen, datalen; int i, j, error, *fdp, oldfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); p = td->td_proc; fdesc = p->p_fd; error = 0; control = *controlp; clen = control->m_len; *controlp = NULL; initial_controlp = controlp; for (cm = mtod(control, struct cmsghdr *); cm != NULL;) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); if (oldfds == 0) break; /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_SLOCK(fdesc); for (i = 0; i < oldfds; i++, fdp++) { fp = fget_locked(fdesc, *fdp); if (fp == NULL) { FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdesc); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to the * file structure and capability rights. */ newlen = oldfds * sizeof(fdep[0]); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_SUNLOCK(fdesc); error = E2BIG; goto out; } fdp = data; for (i = 0; i < oldfds; i++, fdp++) { if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { fdp = data; for (j = 0; j < i; j++, fdp++) { fdrop(fdesc->fd_ofiles[*fdp]. fde_file, td); } FILEDESC_SUNLOCK(fdesc); error = EBADF; goto out; } } fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, M_WAITOK); for (i = 0; i < oldfds; i++, fdev++, fdp++) { fde = &fdesc->fd_ofiles[*fdp]; fdep[i] = fdev; fdep[i]->fde_file = fde->fde_file; filecaps_copy(&fde->fde_caps, &fdep[i]->fde_caps, true); unp_internalize_fp(fdep[i]->fde_file); } FILEDESC_SUNLOCK(fdesc); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; case SCM_BINTIME: *controlp = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } bt = (struct bintime *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); bintime(bt); break; case SCM_REALTIME: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanotime(ts); break; case SCM_MONOTONIC: *controlp = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } ts = (struct timespec *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); nanouptime(ts); break; default: error = EINVAL; goto out; } if (*controlp != NULL) controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: if (error != 0 && initial_controlp != NULL) unp_internalize_cleanup_rights(*initial_controlp); m_freem(control); return (error); } static struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control, int mode) { struct mbuf *m, *n, *n_prev; const struct cmsghdr *cm; int ngroups, i, cmsgtype; size_t ctrlsz; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); if (mode & UNP_WANTCRED_ALWAYS) { ctrlsz = SOCKCRED2SIZE(ngroups); cmsgtype = SCM_CREDS2; } else { ctrlsz = SOCKCREDSIZE(ngroups); cmsgtype = SCM_CREDS; } m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET); if (m == NULL) return (control); if (mode & UNP_WANTCRED_ALWAYS) { struct sockcred2 *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_version = 0; sc->sc_pid = td->td_proc->p_pid; sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } else { struct sockcred *sc; sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; } /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL && cmsgtype == SCM_CREDS) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. */ m->m_next = control; return (m); } static struct unpcb * fptounp(struct file *fp) { struct socket *so; if (fp->f_type != DTYPE_SOCKET) return (NULL); if ((so = fp->f_data) == NULL) return (NULL); if (so->so_proto->pr_domain != &localdomain) return (NULL); return sotounpcb(so); } static void unp_discard(struct file *fp) { struct unp_defer *dr; if (unp_externalize_fp(fp)) { dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK); dr->ud_fp = fp; UNP_DEFERRED_LOCK(); SLIST_INSERT_HEAD(&unp_defers, dr, ud_link); UNP_DEFERRED_UNLOCK(); atomic_add_int(&unp_defers_count, 1); taskqueue_enqueue(taskqueue_thread, &unp_defer_task); } else closef_nothread(fp); } static void unp_process_defers(void *arg __unused, int pending) { struct unp_defer *dr; SLIST_HEAD(, unp_defer) drl; int count; SLIST_INIT(&drl); for (;;) { UNP_DEFERRED_LOCK(); if (SLIST_FIRST(&unp_defers) == NULL) { UNP_DEFERRED_UNLOCK(); break; } SLIST_SWAP(&unp_defers, &drl, unp_defer); UNP_DEFERRED_UNLOCK(); count = 0; while ((dr = SLIST_FIRST(&drl)) != NULL) { SLIST_REMOVE_HEAD(&drl, ud_link); closef_nothread(dr->ud_fp); free(dr, M_TEMP); count++; } atomic_add_int(&unp_defers_count, -count); } } static void unp_internalize_fp(struct file *fp) { struct unpcb *unp; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_file = fp; unp->unp_msgcount++; } unp_rights++; UNP_LINK_WUNLOCK(); } static int unp_externalize_fp(struct file *fp) { struct unpcb *unp; int ret; UNP_LINK_WLOCK(); if ((unp = fptounp(fp)) != NULL) { unp->unp_msgcount--; ret = 1; } else ret = 0; unp_rights--; UNP_LINK_WUNLOCK(); return (ret); } /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ static int unp_marked; static void unp_remove_dead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. */ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs--; } } static void unp_restore_undead_ref(struct filedescent **fdep, int fdcount) { struct unpcb *unp; struct file *fp; int i; /* * This function can only be called from the gc task. */ KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, ("%s: not on gc callout", __func__)); UNP_LINK_LOCK_ASSERT(); for (i = 0; i < fdcount; i++) { fp = fdep[i]->fde_file; if ((unp = fptounp(fp)) == NULL) continue; if ((unp->unp_gcflag & UNPGC_DEAD) == 0) continue; unp->unp_gcrefs++; unp_marked++; } } static void unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int)) { struct socket *so, *soa; so = unp->unp_socket; SOCK_LOCK(so); if (SOLISTENING(so)) { /* * Mark all sockets in our accept queue. */ TAILQ_FOREACH(soa, &so->sol_comp, so_list) { if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) continue; SOCKBUF_LOCK(&soa->so_rcv); unp_scan(soa->so_rcv.sb_mb, op); SOCKBUF_UNLOCK(&soa->so_rcv); } } else { /* * Mark all sockets we reference with RIGHTS. */ if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, op); SOCKBUF_UNLOCK(&so->so_rcv); } } SOCK_UNLOCK(so); } static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "Number of unreachable sockets claimed by the garbage collector."); static int unp_taskcount; SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "Number of times the garbage collector has run."); SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, "Number of active local sockets."); static void unp_gc(__unused void *arg, int pending) { struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, NULL }; struct unp_head **head; struct unp_head unp_deadhead; /* List of potentially-dead sockets. */ struct file *f, **unref; struct unpcb *unp, *unptmp; int i, total, unp_unreachable; LIST_INIT(&unp_deadhead); unp_taskcount++; UNP_LINK_RLOCK(); /* * First determine which sockets may be in cycles. */ unp_unreachable = 0; for (head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) { KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0, ("%s: unp %p has unexpected gc flags 0x%x", __func__, unp, (unsigned int)unp->unp_gcflag)); f = unp->unp_file; /* * Check for an unreachable socket potentially in a * cycle. It must be in a queue as indicated by * msgcount, and this must equal the file reference * count. Note that when msgcount is 0 the file is * NULL. */ if (f != NULL && unp->unp_msgcount != 0 && refcount_load(&f->f_count) == unp->unp_msgcount) { LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead); unp->unp_gcflag |= UNPGC_DEAD; unp->unp_gcrefs = unp->unp_msgcount; unp_unreachable++; } } /* * Scan all sockets previously marked as potentially being in a cycle * and remove the references each socket holds on any UNPGC_DEAD * sockets in its queue. After this step, all remaining references on * sockets marked UNPGC_DEAD should not be part of any cycle. */ LIST_FOREACH(unp, &unp_deadhead, unp_dead) unp_gc_scan(unp, unp_remove_dead_ref); /* * If a socket still has a non-negative refcount, it cannot be in a * cycle. In this case increment refcount of all children iteratively. * Stop the scan once we do a complete loop without discovering * a new reachable socket. */ do { unp_marked = 0; LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp) if (unp->unp_gcrefs > 0) { unp->unp_gcflag &= ~UNPGC_DEAD; LIST_REMOVE(unp, unp_dead); KASSERT(unp_unreachable > 0, ("%s: unp_unreachable underflow.", __func__)); unp_unreachable--; unp_gc_scan(unp, unp_restore_undead_ref); } } while (unp_marked); UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; /* * Allocate space for a local array of dead unpcbs. * TODO: can this path be simplified by instead using the local * dead list at unp_deadhead, after taking out references * on the file object and/or unpcb and dropping the link lock? */ unref = malloc(unp_unreachable * sizeof(struct file *), M_TEMP, M_WAITOK); /* * Iterate looking for sockets which have been specifically marked * as unreachable and store them locally. */ UNP_LINK_RLOCK(); total = 0; LIST_FOREACH(unp, &unp_deadhead, unp_dead) { KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0, ("%s: unp %p not marked UNPGC_DEAD", __func__, unp)); unp->unp_gcflag &= ~UNPGC_DEAD; f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || refcount_load(&f->f_count) != unp->unp_msgcount || !fhold(f)) continue; unref[total++] = f; KASSERT(total <= unp_unreachable, ("%s: incorrect unreachable count.", __func__)); } UNP_LINK_RUNLOCK(); /* * Now flush all sockets, free'ing rights. This will free the * struct files associated with these sockets but leave each socket * with one remaining ref. */ for (i = 0; i < total; i++) { struct socket *so; so = unref[i]->f_data; CURVNET_SET(so->so_vnet); sorflush(so); CURVNET_RESTORE(); } /* * And finally release the sockets so they can be reclaimed. */ for (i = 0; i < total; i++) fdrop(unref[i], NULL); unp_recycled += total; free(unref, M_TEMP); } static void unp_dispose_mbuf(struct mbuf *m) { if (m) unp_scan(m, unp_freerights); } /* * Synchronize against unp_gc, which can trip over data as we are freeing it. */ static void unp_dispose(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; UNP_LINK_WUNLOCK(); if (!SOLISTENING(so)) unp_dispose_mbuf(so->so_rcv.sb_mb); } static void unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) { struct mbuf *m; struct cmsghdr *cm; void *data; socklen_t clen, datalen; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { (*op)(data, datalen / sizeof(struct filedescent *)); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_nextpkt; } } /* * A helper function called by VFS before socket-type vnode reclamation. * For an active vnode it clears unp_vnode pointer and decrements unp_vnode * use count. */ void vfs_unp_reclaim(struct vnode *vp) { struct unpcb *unp; int active; struct mtx *vplock; ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim"); KASSERT(vp->v_type == VSOCK, ("vfs_unp_reclaim: vp->v_type != VSOCK")); active = 0; vplock = mtx_pool_find(mtxpool_sleep, vp); mtx_lock(vplock); VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); if (unp->unp_vnode == vp) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; active = 1; } UNP_PCB_UNLOCK(unp); done: mtx_unlock(vplock); if (active) vunref(vp); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_unpflags(int unp_flags) { int comma; comma = 0; if (unp_flags & UNP_HAVEPC) { db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ALWAYS) { db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED_ONESHOT) { db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNWAIT) { db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNECTING) { db_printf("%sUNP_CONNECTING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_BINDING) { db_printf("%sUNP_BINDING", comma ? ", " : ""); comma = 1; } } static void db_print_xucred(int indent, struct xucred *xu) { int comma, i; db_print_indent(indent); db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; for (i = 0; i < xu->cr_ngroups; i++) { db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]); comma = 1; } db_printf("\n"); } static void db_print_unprefs(int indent, struct unp_head *uh) { struct unpcb *unp; int counter; counter = 0; LIST_FOREACH(unp, uh, unp_reflink) { if (counter % 4 == 0) db_print_indent(indent); db_printf("%p ", unp); if (counter % 4 == 3) db_printf("\n"); counter++; } if (counter != 0 && counter % 4 != 0) db_printf("\n"); } DB_SHOW_COMMAND(unpcb, db_show_unpcb) { struct unpcb *unp; if (!have_addr) { db_printf("usage: show unpcb \n"); return; } unp = (struct unpcb *)addr; db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, unp->unp_vnode); db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino, unp->unp_conn); db_printf("unp_refs:\n"); db_print_unprefs(2, &unp->unp_refs); /* XXXRW: Would be nice to print the full address, if any. */ db_printf("unp_addr: %p\n", unp->unp_addr); db_printf("unp_gencnt: %llu\n", (unsigned long long)unp->unp_gencnt); db_printf("unp_flags: %x (", unp->unp_flags); db_print_unpflags(unp->unp_flags); db_printf(")\n"); db_printf("unp_peercred:\n"); db_print_xucred(2, &unp->unp_peercred); db_printf("unp_refcount: %u\n", unp->unp_refcount); } #endif diff --git a/sys/net/if.c b/sys/net/if.c index bbd677e0153c..5bf44d014db3 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4608 +1,4618 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_bpf.h" #include "opt_inet6.h" #include "opt_inet.h" #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); static void if_siocaddmulti(void *, int); static void if_link_ifnet(struct ifnet *); static bool if_unlink_ifnet(struct ifnet *, bool); #ifdef VIMAGE static int if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock. * This may be acquired to stabilise the list, or we may rely on NET_EPOCH. */ struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); struct sx ifnet_detach_sxlock; SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) #ifdef VIMAGE #define VNET_IS_SHUTTING_DOWN(_vnet) \ ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE) #endif static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; if (__predict_false(idx > V_if_index)) return (NULL); ifp = *(struct ifnet * const volatile *)(V_ifindex_table + idx); return (__predict_false(ifp == IFNET_HOLD) ? NULL : ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); if (!if_try_ref(ifp)) return (NULL); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); #endif static void if_link_ifnet(struct ifnet *ifp) { IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); } static bool if_unlink_ifnet(struct ifnet *ifp, bool vmove) { struct ifnet *iter; int found = 0; IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); return (found); } #ifdef VIMAGE static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; struct ifnet **pending; int found, i; i = 0; /* * We need to protect our access to the V_ifnet tailq. Ordinarily we'd * enter NET_EPOCH, but that's not possible, because if_vmove() calls * if_detach_internal(), which waits for NET_EPOCH callbacks to * complete. We can't do that from within NET_EPOCH. * * However, we can also use the IFNET_xLOCK, which is the V_ifnet * read/write lock. We cannot hold the lock as we call if_vmove() * though, as that presents LOR w.r.t ifnet_sx, in_multi_sx and iflib * ctx lock. */ IFNET_WLOCK(); pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt, M_IFNET, M_WAITOK | M_ZERO); /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) { found = if_unlink_ifnet(ifp, true); MPASS(found); pending[i++] = ifp; } } IFNET_WUNLOCK(); for (int j = 0; j < i; j++) { if_vmove(pending[j], pending[j]->if_home_vnet); } free(pending, M_IFNET); } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); free(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { u_int old; /* We don't assert the ifnet list lock here, but arguably should. */ old = refcount_acquire(&ifp->if_refcount); KASSERT(old > 0, ("%s: ifp %p has 0 refs", __func__, ifp)); } bool if_try_ref(struct ifnet *ifp) { NET_EPOCH_ASSERT(); return (refcount_acquire_if_not_zero(&ifp->if_refcount)); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif if_link_ifnet(ifp); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); } SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeifaddr((struct in6_ifaddr *)ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { bool found; CURVNET_SET_QUIET(ifp->if_vnet); found = if_unlink_ifnet(ifp, false); if (found) { sx_xlock(&ifnet_detach_sxlock); if_detach_internal(ifp, 0, NULL); sx_xunlock(&ifnet_detach_sxlock); } CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; #ifdef VIMAGE bool shutdown; shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); #endif /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late destruction of multicast options * which lead to leave group calls, which in turn access the * belonging ifnet structure: */ epoch_drain_callbacks(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static int if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; #ifdef DEV_BPF u_int bif_dlt, bif_hdrlen; #endif void *old; int rc; #ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); #endif /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return (rc); /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); #ifdef DEV_BPF if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); #endif CURVNET_RESTORE(); return (0); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int error; bool found; bool shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); found = if_unlink_ifnet(ifp, true); MPASS(found); /* Move the interface into the child jail/vnet. */ error = if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int error, found; bool shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet); if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ found = if_unlink_ifnet(ifp, true); MPASS(found); error = if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Helper function to remove a group out of an interface. Expects the global * ifnet lock to be write-locked, and drops it before returning. */ static void _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl, const char *groupname) { struct ifg_member *ifgm; bool freeifgl; IFNET_WLOCK_ASSERT(); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == ifp) { CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); break; } } if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = true; } else { freeifgl = false; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } _if_delgroup_locked(ifp, ifgl, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) { strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); _if_delgroup_locked(ifp, ifgl, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; NET_EPOCH_ASSERT(); if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) return (EINVAL); bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) return (error); len -= sizeof(ifgrq); ifgp++; } return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { u_int old; old = refcount_acquire(&ifa->ifa_refcnt); KASSERT(old > 0, ("%s: ifa %p has 0 refs", __func__, ifa)); } int ifa_try_ref(struct ifaddr *ifa) { NET_EPOCH_ASSERT(); return (refcount_acquire_if_not_zero(&ifa->ifa_refcnt)); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; NET_EPOCH_ASSERT(); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; /* XXXGL: reference ifp? */ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp; int link_state; ifp = arg; link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; case SIOCGIFDATA: { struct if_data ifd; /* Ensure uninitialised padding is not leaked. */ memset(&ifd, 0, sizeof(ifd)); if_data_copy(ifp, &ifd); error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd)); break; } #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (strcmp(new_name, ifp->if_xname) == 0) break; if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET DEBUGNET_NOTIFY_MTU(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = if_getgroup((struct ifgroupreq *)data, ifp); NET_EPOCH_EXIT(et); break; } case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; struct ifmediareq *ifmrp = NULL; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE bool shutdown; #endif + /* + * Interface ioctls access a global namespace. There is currently no + * capability-based representation for interfaces, so the configuration + * interface is simply unaccessible from capability mode. If necessary, + * select ioctls may be permitted here. + */ + if (IN_CAPABILITY_MODE(td)) + return (ECAPMODE); + CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet); if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); goto out_noref; #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); if (error == 0) ifc32->ifc_len = ifc.ifc_len; goto out_noref; } #endif } #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) { sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(ifr->ifr_name); sx_xunlock(&ifnet_detach_sxlock); } goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to maxphys to avoid DoS from userspace. */ max_len = maxphys - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { if (THREAD_CAN_SLEEP()) (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); else taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } static void if_siocaddmulti(void *arg, int pending) { struct ifnet *ifp; ifp = arg; #ifdef DIAGNOSTIC if (pending > 1) if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending); #endif CURVNET_SET(ifp->if_vnet); (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); CURVNET_RESTORE(); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; KASSERT(ifp, ("%s: NULL ifp", __func__)); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; NET_EPOCH_EXIT(et); if (ifp != oifp) ifp = NULL; } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } static int if_vlog(struct ifnet *ifp, int pri, const char *fmt, va_list ap) { char if_fmt[256]; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); vlog(pri, if_fmt, ap); return (0); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if_vlog(ifp, LOG_INFO, fmt, ap); va_end(ap); return (0); } int if_log(struct ifnet *ifp, int pri, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if_vlog(ifp, pri, fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late invocation of if_destroy(), which leads * to memory leak from if_com_alloc[type] allocated if_l2com. */ epoch_drain_callbacks(net_epoch_preempt); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ifp->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } /* * Methods for drivers to access interface unicast and multicast * link level addresses. Driver shall not know 'struct ifaddr' neither * 'struct ifmultiaddr'. */ u_int if_lladdr_count(if_t ifp) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr, count); } NET_EPOCH_EXIT(et); return (count); } u_int if_llmaddr_count(if_t ifp) { struct epoch_tracker et; struct ifmultiaddr *ifma; int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) if (ifma->ifma_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifmultiaddr *ifma; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr, count); } NET_EPOCH_EXIT(et); return (count); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } diff --git a/sys/net/route.c b/sys/net/route.c index 2416aa9a983f..f093a71b7585 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -1,747 +1,750 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" #include "opt_route.h" #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rtstat); #endif EVENTHANDLER_LIST_DEFINE(rt_addrmsg); static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, void *arg); static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info, int flags); /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); struct rib_head * rt_table_init(int offset, int family, u_int fibnum) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Save metadata associated with this routing table. */ rh->rib_family = family; rh->rib_fibnum = fibnum; #ifdef VIMAGE rh->rib_vnet = curvnet; #endif tmproutes_init(rh); /* Init locks */ RIB_LOCK_INIT(rh); nhops_init_rib(rh); /* Init subscription system */ rib_init_subscriptions(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { RIB_WLOCK(rh); rh->rib_dying = true; RIB_WUNLOCK(rh); #ifdef FIB_ALGO fib_destroy_rib(rh); #endif tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); nhops_destroy_rib(rh); rib_destroy_subscriptions(rh); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } /* * Adds a temporal redirect entry to the routing table. * @fibnum: fib number * @dst: destination to install redirect to * @gateway: gateway to go via * @author: sockaddr of originating router, can be NULL * @ifp: interface to use for the redirected route * @flags: set of flags to add. Allowed: RTF_GATEWAY * @lifetime_sec: time in seconds to expire this redirect. * * Retuns 0 on success, errno otherwise. */ int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec) { struct rib_cmd_info rc; int error; struct rt_addrinfo info; struct rt_metrics rti_rmx; struct ifaddr *ifa; NET_EPOCH_ASSERT(); if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL) return (EAFNOSUPPORT); /* Verify the allowed flag mask. */ KASSERT(((flags & ~(RTF_GATEWAY)) == 0), ("invalid redirect flags: %x", flags)); flags |= RTF_HOST | RTF_DYNAMIC; /* Get the best ifa for the given interface and gateway. */ if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL) return (ENETUNREACH); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_ifa = ifa; info.rti_ifp = ifp; info.rti_flags = flags; /* Setup route metrics to define expire time. */ bzero(&rti_rmx, sizeof(rti_rmx)); /* Set expire time as absolute. */ rti_rmx.rmx_expire = lifetime_sec + time_second; info.rti_mflags |= RTV_EXPIRE; info.rti_rmx = &rti_rmx; error = rib_action(fibnum, RTM_ADD, &info, &rc); if (error != 0) { /* TODO: add per-fib redirect stats. */ return (error); } RTSTAT_INC(rts_dynamic); /* Send notification of a route addition to userland. */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_AUTHOR] = author; rt_missmsg_fib(RTM_REDIRECT, &info, flags | RTF_UP, error, fibnum); return (0); } /* * Routing table ioctl interface. */ int -rtioctl_fib(u_long req, caddr_t data, u_int fibnum) +rtioctl_fib(u_long req, caddr_t data, u_int fibnum, struct thread *td) { + if (IN_CAPABILITY_MODE(td)) + return (ECAPMODE); /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, const struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct nhop_object *nh; nh = rib_lookup(fibnum, gateway, NHR_NONE, 0); /* * dismiss a gateway that is reachable only * through the default router */ if ((nh == NULL) || (nh->nh_flags & NHF_DEFAULT)) return (NULL); ifa = nh->nh_ifa; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } return (ifa); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp and rt_ifa. * * Returns 0 on success. */ static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = &nh->gw_sa; dst = info->rti_info[RTAX_GATEWAY]; if ((nhop_get_rtflags(nh) & RTF_GATEWAY) && src != NULL && dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (nhop_get_rtflags(nh) & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = nh->nh_mtu; } info->rti_flags = rt->rte_flags | nhop_get_rtflags(nh); info->rti_ifp = nh->nh_ifp; info->rti_ifa = nh->nh_ifa; if (flags & NHR_REF) { if_ref(info->rti_ifp); ifa_ref(info->rti_ifa); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * If @flags contains NHR_REF, refcouting is performed on rt_ifp and rt_ifa. * All references can be released later by calling rib_free_info(). * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); nh = nhop_select(rt->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, nh, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { ifa_free(info->rti_ifa); if_rele(info->rti_ifp); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * nh pointer to nhop * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { struct ifnet *ifp = arg; if (nh->nh_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rte_flags & RTF_UP) == 0) return (0); return (1); } void rt_flushifroutes(struct ifnet *ifp) { rib_foreach_table_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Look up rt_addrinfo for a specific fib. * * Assume basic consistency checks are executed by callers: * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { const struct sockaddr *dst, *gateway, *ifpaddr, *ifaaddr; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; ifpaddr = info->rti_info[RTAX_IFP]; ifaaddr = info->rti_info[RTAX_IFA]; flags = info->rti_flags; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; /* If we have interface specified by the ifindex in the address, use it */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr; if (sdl->sdl_index != 0) info->rti_ifp = ifnet_byindex(sdl->sdl_index); } /* * If we have source address specified, try to find it * TODO: avoid enumerating all ifas on all interfaces. */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { const struct sockaddr *sa; /* * Most common use case for the userland-supplied routes. * * Choose sockaddr to select ifa. * -- if ifp is set -- * Order of preference: * 1) IFA address * 2) gateway address * Note: for interface routes link-level gateway address * is specified to indicate the interface index without * specifying RTF_GATEWAY. In this case, ignore gateway * Note: gateway AF may be different from dst AF. In this case, * ignore gateway * 3) final destination. * 4) if all of these fails, try to get at least link-level ifa. * -- else -- * try to lookup gateway or dst in the routing table to get ifa */ if (info->rti_info[RTAX_IFA] != NULL) sa = info->rti_info[RTAX_IFA]; else if ((info->rti_flags & RTF_GATEWAY) != 0 && gateway->sa_family == dst->sa_family) sa = gateway; else sa = dst; if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); /* Case 4 */ if (info->rti_ifa == NULL && gateway != NULL) info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } void rt_updatemtu(struct ifnet *ifp) { struct rib_head *rnh; int mtu; int i, j; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; nhops_update_ifmtu(rnh, ifp, mtu); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa); } return (i); } #endif void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT((fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd); if (V_rt_add_addr_allfibs) fibnum = RT_ALL_FIBS; return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @rt: valid rtentry * @nh: nhop object to announce * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, rt, nh, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @info: addrinfo structure with valid data. * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__)); return (rtsock_routemsg_info(cmd, info, fibnum)); } diff --git a/sys/net/route.h b/sys/net/route.h index 3fdca303596e..64e89965f9cd 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -1,449 +1,451 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a * preformed header. They are not cacheable. */ char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ #define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_nhidx; /* route nexhop index */ u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight /* default route weight */ #define RT_DEFAULT_WEIGHT 1 #define RT_MAX_WEIGHT 16777215 /* 3 bytes */ /* * Keep a generation count of routing table, incremented on route addition, * so we can invalidate caches. This is accessed without a lock, as precision * is not required. */ typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ #define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL VNET_DECLARE(uint32_t, _rt_numfibs); /* number of existing route tables */ #define V_rt_numfibs VNET(_rt_numfibs) /* temporary compat arg */ #define rt_numfibs V_rt_numfibs VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) /* Calculate flowid for locally-originated packets */ #define V_fib_hash_outbound VNET(fib_hash_outbound) VNET_DECLARE(u_int, fib_hash_outbound); /* Outbound flowid generation rules */ #ifdef RSS #define fib4_calc_packet_hash xps_proto_software_hash_v4 #define fib6_calc_packet_hash xps_proto_software_hash_v6 #define CALC_FLOWID_OUTBOUND_SENDTO true #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND false #endif #else /* !RSS */ #define fib4_calc_packet_hash fib4_calc_software_hash #define fib6_calc_packet_hash fib6_calc_software_hash #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND_SENDTO V_fib_hash_outbound #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND_SENDTO false #define CALC_FLOWID_OUTBOUND false #endif #endif /* RSS */ #endif /* _KERNEL */ /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ /* 0x40000000 unused, was RTF_RNH_LOCKED */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. */ /* Consumer-visible nexthop info flags */ #define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_NONE 0x00 /* empty flags field */ #define NHR_REF 0x01 /* reference nexhop */ #define NHR_NODEFAULT 0x02 /* uRPF: do not consider default route */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #define NHR_UNLOCKED 0x200 /* Do not lock table */ /* * Routing statistics. */ struct rtstat { uint64_t rts_badredirect; /* bogus redirect calls */ uint64_t rts_dynamic; /* routes created by redirects */ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ uint64_t rts_add_failure; /* # of route addition failures */ uint64_t rts_add_retry; /* # of route addition retries */ uint64_t rts_del_failure; /* # of route deletion failure */ uint64_t rts_del_retry; /* # of route deletion retries */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ u_short _rtm_spare1; int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. * * The format for each message is annotated below using the following * identifiers: * * (1) struct rt_msghdr * (2) struct ifa_msghdr * (3) struct if_msghdr * (4) struct ifma_msghdr * (5) struct if_announcemsghdr * */ #define RTM_ADD 0x1 /* (1) Add Route */ #define RTM_DELETE 0x2 /* (1) Delete Route */ #define RTM_CHANGE 0x3 /* (1) Change Metrics or flags */ #define RTM_GET 0x4 /* (1) Report Metrics */ #define RTM_LOSING 0x5 /* (1) Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* (1) Told to use different route */ #define RTM_MISS 0x7 /* (1) Lookup failed on this address */ #define RTM_LOCK 0x8 /* (1) fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* (1) req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* (2) address being added to iface */ #define RTM_DELADDR 0xd /* (2) address being removed from iface */ #define RTM_IFINFO 0xe /* (3) iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* (4) mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* (4) mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* (5) iface arrival/departure */ #define RTM_IEEE80211 0x12 /* (5) IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ struct rtentry; struct nhop_object; typedef int rib_filter_f_t(const struct rtentry *, const struct nhop_object *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rib_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter paramenters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RO_NHFREE(_ro) do { \ if ((_ro)->ro_nh) { \ NH_FREE((_ro)->ro_nh); \ (_ro)->ro_nh = NULL; \ } \ } while (0) #define RO_INVALIDATE_CACHE(ro) do { \ if ((ro)->ro_lle != NULL) { \ LLE_FREE((ro)->ro_lle); \ (ro)->ro_lle = NULL; \ } \ if ((ro)->ro_nh != NULL) { \ NH_FREE((ro)->ro_nh); \ (ro)->ro_nh = NULL; \ } \ } while (0) /* * Validate a cached route based on a supplied cookie. If there is an * out-of-date cache, simply free it. Update the generation number * for the new allocation */ #define NH_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ RO_INVALIDATE_CACHE(ro); \ *(cookiep) = cookie; \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct rtentry *, struct nhop_object *, int); int rt_routemsg_info(int, struct rt_addrinfo *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int, int, u_int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(uint32_t table, sa_family_t family); struct sockaddr *rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask, struct sockaddr_storage *dmask); void rt_updatemtu(struct ifnet *); void rt_flushifroutes(struct ifnet *ifp); +struct thread; + /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. */ -int rtioctl_fib(u_long, caddr_t, u_int); +int rtioctl_fib(u_long, caddr_t, u_int, struct thread *); int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); /* New API */ void rib_flush_routes_family(int family); struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); #endif #endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c index bcf071a81e0e..5f70dd1ec824 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1,1711 +1,1715 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_gifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static bool ia_need_loopback_route(const struct in_ifaddr *); VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) return (1); } return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *original_ia) { struct rm_priotracker in_ifa_tracker; in_addr_t original_addr = IA_SIN(original_ia)->sin_addr.s_addr; uint32_t original_fib = original_ia->ia_ifa.ifa_ifp->if_fib; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(original_addr), ia_hash) { in_addr_t addr = IA_SIN(ia)->sin_addr.s_addr; uint32_t fib = ia->ia_ifa.ifa_ifp->if_fib; if (!V_rt_add_addr_allfibs && (original_fib != fib)) continue; if ((original_ia != ia) && (original_addr == addr)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (ia); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_ZERONET(i) || IN_LOOPBACK(i)) return (0); return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); + if (td != NULL && IN_CAPABILITY_MODE(td)) + return (ECAPMODE); + /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCGIFALIAS: sx_xlock(&in_control_sx); error = in_gifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } NET_EPOCH_EXIT(et); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; else iaIsFirst = false; } NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { error = in_addprefix(ia); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && ia_need_loopback_route(ia)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } /* * Note: we don't need extra reference for ifa, since we called * with sx lock held, and ifaddr can not be deleted in concurrent * thread. */ EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } static int in_gifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; /* * ifra_addr must be present and be of INET family. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); /* * See whether address exist. */ ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) { ia = it; break; } } if (ia == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } ifra->ifra_mask = ia->ia_sockmask; if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_family == AF_INET) ifra->ifra_dstaddr = ia->ia_dstaddr; else if ((ifp->if_flags & IFF_BROADCAST) && ia->ia_broadaddr.sin_family == AF_INET) ifra->ifra_broadaddr = ia->ia_broadaddr; else memset(&ifra->ifra_broadaddr, 0, sizeof(ifra->ifra_broadaddr)); NET_EPOCH_EXIT(et); return (0); } static int in_match_ifaddr(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { if (nh->nh_ifa == (struct ifaddr *)arg) return (1); return (0); } static int in_handle_prefix_route(uint32_t fibnum, int cmd, struct sockaddr_in *dst, struct sockaddr_in *netmask, struct ifaddr *ifa, struct ifnet *ifp) { NET_EPOCH_ASSERT(); /* Prepare gateway */ struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_type = ifa->ifa_ifp->if_type, .sdl_index = ifa->ifa_ifp->if_index, }; struct rt_addrinfo info = { .rti_ifa = ifa, .rti_ifp = ifp, .rti_flags = RTF_PINNED | ((netmask != NULL) ? 0 : RTF_HOST), .rti_info = { [RTAX_DST] = (struct sockaddr *)dst, [RTAX_NETMASK] = (struct sockaddr *)netmask, [RTAX_GATEWAY] = (struct sockaddr *)&sdl, }, /* Ensure we delete the prefix IFF prefix ifa matches */ .rti_filter = in_match_ifaddr, .rti_filterdata = ifa, }; return (rib_handle_ifaddr_info(fibnum, cmd, &info)); } /* * Routing table interaction with interface addresses. * * In general, two types of routes needs to be installed: * a) "interface" or "prefix" route, telling user that the addresses * behind the ifa prefix are reached directly. * b) "loopback" route installed for the ifa address, telling user that * the address belongs to local system. * * Handling for (a) and (b) differs in multi-fib aspects, hence they * are implemented in different functions below. * * The cases above may intersect - /32 interface aliases results in * the same prefix produced by (a) and (b). This blurs the definition * of the "loopback" route and complicate interactions. The interaction * table is defined below. The case numbers are used in the multiple * functions below to refer to the particular test case. * * There can be multiple options: * 1) Adding address with prefix on non-p2p/non-loopback interface. * Example: 192.0.2.1/24. Action: * * add "prefix" route towards 192.0.2.0/24 via @ia interface, * using @ia as an address source. * * add "loopback" route towards 192.0.2.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 2) Adding address with /32 mask to non-p2p/non-loopback interface. * Example: 192.0.2.2/32. Action: * * add "prefix" host route via V_loif, using @ia as an address source. * * 3) Adding address with or without prefix to p2p interface. * Example: 10.0.0.1/24->10.0.0.2. Action: * * add "prefix" host route towards 10.0.0.2 via this interface, using @ia * as an address source. Note: no sense in installing full /24 as the interface * is point-to-point. * * add "loopback" route towards 10.0.9.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 4) Adding address with or without prefix to loopback interface. * Example: 192.0.2.1/24. Action: * * add "prefix" host route via @ia interface, using @ia as an address source. * Note: Skip installing /24 prefix as it would introduce TTL loop * for the traffic destined to these addresses. */ /* * Checks if @ia needs to install loopback route to @ia address via * ifa_maintain_loopback_route(). * * Return true on success. */ static bool ia_need_loopback_route(const struct in_ifaddr *ia) { struct ifnet *ifp = ia->ia_ifp; /* Case 4: Skip loopback interfaces */ if ((ifp->if_flags & IFF_LOOPBACK) || (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)) return (false); /* Clash avoidance: Skip p2p interfaces with both addresses are equal */ if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) return (false); /* Case 2: skip /32 prefixes */ if (!(ifp->if_flags & IFF_POINTOPOINT) && (ia->ia_sockmask.sin_addr.s_addr == INADDR_BROADCAST)) return (false); return (true); } /* * Calculate "prefix" route corresponding to @ia. */ static void ia_getrtprefix(const struct in_ifaddr *ia, struct in_addr *prefix, struct in_addr *mask) { if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) { /* Case 3: return host route for dstaddr */ *prefix = ia->ia_dstaddr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else if (ia->ia_ifp->if_flags & IFF_LOOPBACK) { /* Case 4: return host route for ifaddr */ *prefix = ia->ia_addr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else { /* Cases 1,2: return actual ia prefix */ *prefix = ia->ia_addr.sin_addr; *mask = ia->ia_sockmask.sin_addr; prefix->s_addr &= mask->s_addr; } } /* * Adds or delete interface "prefix" route corresponding to @ifa. * Returns 0 on success or errno. */ int in_handle_ifaddr_route(int cmd, struct in_ifaddr *ia) { struct ifaddr *ifa = &ia->ia_ifa; struct in_addr daddr, maddr; struct sockaddr_in *pmask; struct epoch_tracker et; int error; ia_getrtprefix(ia, &daddr, &maddr); struct sockaddr_in mask = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr = maddr, }; pmask = (maddr.s_addr != INADDR_BROADCAST) ? &mask : NULL; struct sockaddr_in dst = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr.s_addr = daddr.s_addr & maddr.s_addr, }; struct ifnet *ifp = ia->ia_ifp; if ((maddr.s_addr == INADDR_BROADCAST) && (!(ia->ia_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)))) { /* Case 2: host route on broadcast interface */ ifp = V_loif; } uint32_t fibnum = ifa->ifa_ifp->if_fib; NET_EPOCH_ENTER(et); error = in_handle_prefix_route(fibnum, cmd, &dst, pmask, ifa, ifp); NET_EPOCH_EXIT(et); return (error); } /* * Check if we have a route for the given prefix already. */ static bool in_hasrtprefix(struct in_ifaddr *target) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; bool result = false; ia_getrtprefix(target, &prefix, &mask); IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { result = true; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (result); } int in_addprefix(struct in_ifaddr *target) { int error; if (in_hasrtprefix(target)) { if (V_nosameprefix) return (EEXIST); else { rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); return (0); } } /* * No-one seem to have this prefix route, so we try to insert it. */ rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_ADD, target); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if (ia_need_loopback_route(target) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } ia_getrtprefix(target, &prefix, &mask); if ((target->ia_flags & IFA_ROUTE) == 0) { rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = in_handle_ifaddr_route(RTM_ADD, ia); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; NET_EPOCH_ASSERT(); if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); /* * Make sure all multicast deletions invoking if_ioctl() are * completed before returning. Else we risk accessing a freed * ifnet structure pointer. */ inm_release_wait(NULL); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); if (__predict_false(ifma_restart)) { ifma_restart = true; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. */ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 11688e3cb1d2..351c90699fc2 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -1,345 +1,347 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mrouting.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include /* * While this file provides the domain and protocol switch tables for IPv4, it * also provides the sysctl node declarations for net.inet.* often shared with * IPv6 for common features or by upper layer protocols. In case of no IPv4 * support compile out everything but these sysctl nodes. */ #ifdef INET #include #include #include #include #endif /* INET */ #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #include #include #include #include #include #include #include #include #include #include /* * TCP/IP protocol family: IP, ICMP, UDP, TCP. */ static struct pr_usrreqs nousrreqs; #ifdef SCTP #include #include #include #include #endif FEATURE(inet, "Internet Protocol version 4"); extern struct domain inetdomain; /* Spacer for loadable protocols. */ #define IPPROTOSPACER \ { \ .pr_domain = &inetdomain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inetsw[] = { { .pr_type = 0, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, + .pr_flags = PR_CAPATTACH, .pr_init = ip_init, .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drain, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDP, - .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udp_init, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, - .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| + PR_CAPATTACH, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_init = sctp_init, .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_UDPLITE, - .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp_input, .pr_ctlinput = udplite_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udplite_init, .pr_usrreqs = &udp_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctlinput = rip_ctlinput, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ICMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IGMP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_RSVP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = rsvp_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_MOBILE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_ETHERIP, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, #endif { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, /* Spacer n-times for loadable protocols. */ IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, IPPROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, .pr_usrreqs = &rip_usrreqs }, }; struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[nitems(inetsw)], .dom_rtattach = in_inithead, #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; VNET_DOMAIN_SET(inet); #endif /* INET */ SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Internet Family"); SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP"); SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ICMP"); SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP"); #if defined(SCTP) || defined(SCTP_SUPPORT) SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SCTP"); #endif SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IGMP"); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* XXX no protocol # to use, pick something "reserved" */ SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPSEC"); SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "AH"); SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ESP"); SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "RAW"); SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Accept filters"); diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 02cb9df7da3a..de3db6dc7d33 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -1,2597 +1,2601 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct rt_addrinfo info; struct ifaddr *ifa; struct sockaddr_dl gateway; int fibnum; ifa = &ia->ia_ifa; /* * Prepare info data for the host route. * This code mimics one from ifa_maintain_loopback_route(). */ bzero(&info, sizeof(struct rt_addrinfo)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ifa->ifa_addr; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gateway; link_init_sdl(ifa->ifa_ifp, (struct sockaddr *)&gateway, ifa->ifa_ifp->if_type); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; fibnum = ia62ifa(ia)->ifa_ifp->if_fib; if (cmd == RTM_ADD) { rt_addrmsg(cmd, &ia->ia_ifa, fibnum); rt_routemsg_info(cmd, &info, fibnum); } else if (cmd == RTM_DELETE) { rt_routemsg_info(cmd, &info, fibnum); rt_addrmsg(cmd, &ia->ia_ifa, fibnum); } } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; + if (td != NULL && IN_CAPABILITY_MODE(td)) + return (ECAPMODE); + /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } break; } case SIOCDIFADDR_IN6: in6_purgeifaddr(ia); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); break; default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } static struct in6_multi_mship * in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, int *errorp, int delay) { struct in6_multi_mship *imm; int error; imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); if (imm == NULL) { *errorp = ENOBUFS; return (NULL); } delay = (delay * PR_FASTHZ) / hz; error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); if (error) { *errorp = error; free(imm, M_IP6MADDR); return (NULL); } return (imm); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); CK_LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } /* * Adds or deletes interface route for p2p ifa. * Returns 0 on success or errno. */ static int in6_handle_dstaddr_rtrequest(int cmd, struct in6_ifaddr *ia) { struct epoch_tracker et; struct ifaddr *ifa = &ia->ia_ifa; int error; /* Prepare gateway */ struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_type = ifa->ifa_ifp->if_type, .sdl_index = ifa->ifa_ifp->if_index, }; struct sockaddr_in6 dst = { .sin6_family = AF_INET6, .sin6_len = sizeof(struct sockaddr_in6), .sin6_addr = ia->ia_dstaddr.sin6_addr, }; struct rt_addrinfo info = { .rti_ifa = ifa, .rti_flags = RTF_PINNED | RTF_HOST, .rti_info = { [RTAX_DST] = (struct sockaddr *)&dst, [RTAX_GATEWAY] = (struct sockaddr *)&sdl, }, }; /* Don't set additional per-gw filters on removal */ NET_EPOCH_ENTER(et); error = rib_handle_ifaddr_info(ifa->ifa_ifp->if_fib, cmd, &info); NET_EPOCH_EXIT(et); return (error); } static bool ifa_is_p2p(struct in6_ifaddr *ia) { int plen; plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((plen == 128) && (ia->ia_dstaddr.sin6_family == AF_INET6) && !IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &ia->ia_dstaddr.sin6_addr)) return (true); return (false); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr != NULL) in6_leavegroup(imm->i6mm_maddr, NULL); free(imm, M_IP6MADDR); } /* Check if we need to remove p2p route */ if ((ia->ia_flags & IFA_ROUTE) && ifa_is_p2p(ia)) { error = in6_handle_dstaddr_rtrequest(RTM_DELETE, ia); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } /* * Removes @ia from the corresponding interfaces and unlinks corresponding * prefix if no addresses are using it anymore. */ void in6_purgeifaddr(struct in6_ifaddr *ia) { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } NET_EPOCH_EXIT(et); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto done; } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (in6_handle_dstaddr_rtrequest(RTM_DELETE, ia) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ if (!(ia->ia_flags & IFA_ROUTE) && ifa_is_p2p(ia)) { error = in6_handle_dstaddr_rtrequest(RTM_ADD, ia); if (error) goto done; ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } done: WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Invoking IPv6 network device address event may sleep"); ifa_ref(&ia->ia_ifa); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_ADD); ifa_free(&ia->ia_ifa); return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced if @referenced flag is set. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid, bool referenced) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; if (referenced) ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct epoch_tracker et; struct ifaddr *ifa; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct epoch_tracker et; struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; NET_EPOCH_ASSERT(); in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) return (1); } return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ NET_EPOCH_ASSERT(); dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) return (besta); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) return dep[0]; if (dep[1]) return dep[1]; return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } NET_EPOCH_EXIT(et); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (0); if ((ND_IFINFO(ifp)->flags & (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD)) != 0) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { struct epoch_tracker et; unsigned long maxmtu = 0; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } NET_EPOCH_EXIT(et); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop_object *nh; struct in6_addr dst; uint32_t scopeid; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; NET_EPOCH_ASSERT(); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; nh = fib6_lookup(fibnum, &dst, scopeid, NHR_NONE, 0); if (nh && ((nh->nh_flags & NHF_GATEWAY) || nh->nh_ifp != ifp)) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. */ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 21b7d660676f..56ef41a27e88 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -1,608 +1,610 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, + .pr_flags = PR_CAPATTACH, .pr_init = ip6_init, .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, - .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udp_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, - .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD|PR_LISTEN, + .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD| + PR_LISTEN|PR_CAPATTACH, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization, timeout, and drain routines twice */ .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, #endif .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, #ifndef INET /* Do not call initialization and drain routines twice. */ .pr_drain = sctp_drain, .pr_init = sctp_init, #endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LASTHDR, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = NULL, /* Covered by the SOCK_SEQPACKET entry. */ .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, - .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_CAPATTACH, .pr_input = udp6_input, .pr_ctlinput = udplite6_ctlinput, .pr_ctloutput = udp_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udplite_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = rip_init, #endif .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_fasttimo = icmp6_fasttimo, .pr_slowtimo = icmp6_slowtimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *)&inet6sw[nitems(inet6sw)], .dom_rtattach = in6_inithead, #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; VNET_DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP6"); #if defined(SCTP) || defined(SCTP_SUPPORT) SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SCTP6"); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_temp_preferred_lifetime; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); if (val < V_ip6_desync_factor + V_ip6_temp_regen_advance) return (EINVAL); V_ip6_temp_preferred_lifetime = val; return (0); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { int error, val; val = V_ip6_temp_valid_lifetime; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || !req->newptr) return (error); if (val < V_ip6_temp_preferred_lifetime) return (EINVAL); V_ip6_temp_valid_lifetime = val; return (0); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, "Enable forwarding of IPv6 packets between interfaces"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, "Send ICMPv6 redirects for unforwardable IPv6 packets"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0, "Default hop limit to use for outgoing IPv6 packets"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 RA messages"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set clear the R flag in ICMPv6 NA messages when accepting RA " "on the interface"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding is enabled"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "Frequency in seconds at which to log IPv6 forwarding errors"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, "Default maximum number of IPv6 extension headers permitted on " "incoming IPv6 packets, 0 for no artificial limit"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, "Number of ICMPv6 NS messages sent during duplicate address detection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, "Provide an IPv6 flowlabel in outbound packets"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, "Default hop limit for IPv6 multicast packets originating from this " "node"); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, "KAME version string"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, "Allow the use of addresses whose preferred lifetimes have expired"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, "Create RFC3041 temporary addresses for autoconfigured addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_temppltime, "I", "Maximum preferred lifetime for temporary addresses"); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_ip6_tempvltime, "I", "Maximum valid lifetime for temporary addresses"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, "Restrict AF_INET6 sockets to IPv6 addresses only"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6 " "link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, struct rip6stat, rip6stat, "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, "Prefer RFC3041 temporary addresses in source address selection"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0, "Use the default scope zone when none is specified"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "Enable path MTU discovery for multicast packets"); #ifdef IPSTEALTH SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, "Forward IPv6 packets without decrementing their TTL"); #endif /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "Accept ICMPv6 redirect messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "Delay in seconds before expiring redirect route"); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, "Frequency in seconds of checks for expired prefixes and routers"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, "Delay in seconds before probing for reachability"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, "Number of ICMPv6 NS messages sent during reachability detection"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, "Number of ICMPv6 NS messages sent during address resolution"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, "Create a loopback route when configuring an IPv6 address"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "Mask of enabled RFC4620 node information query types"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup " "for compatibility with KAME implementation"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, "Maximum number of ICMPv6 error messages per second"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); /* XXX unused */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, "Log NDP debug messages"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861"); #ifdef EXPERIMENTAL SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_ignore_ipv6_only_ra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_ignore_ipv6_only_ra), 0, "Ignore the 'IPv6-Only flag' in RA messages in compliance with " "draft-ietf-6man-ipv6only-flag"); #endif diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h index 9c3139c866bd..5c2fa2d4077e 100644 --- a/sys/sys/protosw.h +++ b/sys/sys/protosw.h @@ -1,353 +1,353 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)protosw.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_PROTOSW_H_ #define _SYS_PROTOSW_H_ /* Forward declare these structures referenced from prototypes below. */ struct kaiocb; struct mbuf; struct thread; struct sockaddr; struct socket; struct sockopt; /*#ifdef _KERNEL*/ /* * Protocol switch table. * * Each protocol has a handle initializing one of these structures, * which is used for protocol-protocol and system-protocol communication. * * A protocol is called through the pr_init entry before any other. * Thereafter it is called every 200ms through the pr_fasttimo entry and * every 500ms through the pr_slowtimo for timer based actions. * The system will call the pr_drain entry if it is low on space and * this should throw away any non-critical data. * * Protocols pass data between themselves as chains of mbufs using * the pr_input and pr_output hooks. Pr_input passes data up (towards * the users) and pr_output passes it down (towards the interfaces); control * information passes up and down on pr_ctlinput and pr_ctloutput. * The protocol is responsible for the space occupied by any the * arguments to these entries and must dispose it. * * In retrospect, it would be a lot nicer to use an interface * similar to the vnode VOP interface. */ /* USE THESE FOR YOUR PROTOTYPES ! */ typedef int pr_input_t (struct mbuf **, int*, int); typedef int pr_output_t (struct mbuf *, struct socket *, ...); typedef void pr_ctlinput_t (int, struct sockaddr *, void *); typedef int pr_ctloutput_t (struct socket *, struct sockopt *); typedef void pr_init_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); typedef void pr_drain_t (void); struct protosw { short pr_type; /* socket type used for */ struct domain *pr_domain; /* domain protocol a member of */ short pr_protocol; /* protocol number */ short pr_flags; /* see below */ /* protocol-protocol hooks */ pr_input_t *pr_input; /* input to protocol (from below) */ pr_output_t *pr_output; /* output to protocol (from above) */ pr_ctlinput_t *pr_ctlinput; /* control input (from below) */ pr_ctloutput_t *pr_ctloutput; /* control output (from above) */ /* utility hooks */ pr_init_t *pr_init; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ struct pr_usrreqs *pr_usrreqs; /* user-protocol hook */ }; /*#endif*/ #define PR_SLOWHZ 2 /* 2 slow timeouts per second */ #define PR_FASTHZ 5 /* 5 fast timeouts per second */ /* * This number should be defined again within each protocol family to avoid * confusion. */ #define PROTO_SPACER 32767 /* spacer for loadable protocols */ /* * Values for pr_flags. * PR_ADDR requires PR_ATOMIC; * PR_ADDR and PR_CONNREQUIRED are mutually exclusive. * PR_IMPLOPCL means that the protocol allows sendto without prior connect, * and the protocol understands the MSG_EOF flag. The first property is * is only relevant if PR_CONNREQUIRED is set (otherwise sendto is allowed * anyhow). */ #define PR_ATOMIC 0x01 /* exchange atomic messages only */ #define PR_ADDR 0x02 /* addresses given with messages */ #define PR_CONNREQUIRED 0x04 /* connection required by protocol */ #define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */ #define PR_RIGHTS 0x10 /* passes capabilities */ #define PR_IMPLOPCL 0x20 /* implied open/close */ #define PR_LASTHDR 0x40 /* enforce ipsec policy; last header */ +#define PR_CAPATTACH 0x80 /* socket can attach in cap mode */ /* * In earlier BSD network stacks, a single pr_usrreq() function pointer was * invoked with an operation number indicating what operation was desired. * We now provide individual function pointers which protocols can implement, * which offers a number of benefits (such as type checking for arguments). * These older constants are still present in order to support TCP debugging. */ #define PRU_ATTACH 0 /* attach protocol to up */ #define PRU_DETACH 1 /* detach protocol from up */ #define PRU_BIND 2 /* bind socket to address */ #define PRU_LISTEN 3 /* listen for connection */ #define PRU_CONNECT 4 /* establish connection to peer */ #define PRU_ACCEPT 5 /* accept connection from peer */ #define PRU_DISCONNECT 6 /* disconnect from peer */ #define PRU_SHUTDOWN 7 /* won't send any more data */ #define PRU_RCVD 8 /* have taken data; more room now */ #define PRU_SEND 9 /* send this data */ #define PRU_ABORT 10 /* abort (fast DISCONNECT, DETATCH) */ #define PRU_CONTROL 11 /* control operations on protocol */ #define PRU_SENSE 12 /* return status into m */ #define PRU_RCVOOB 13 /* retrieve out of band data */ #define PRU_SENDOOB 14 /* send out of band data */ #define PRU_SOCKADDR 15 /* fetch socket's address */ #define PRU_PEERADDR 16 /* fetch peer's address */ #define PRU_CONNECT2 17 /* connect two sockets */ /* begin for protocols internal use */ #define PRU_FASTTIMO 18 /* 200ms timeout */ #define PRU_SLOWTIMO 19 /* 500ms timeout */ #define PRU_PROTORCV 20 /* receive from below */ #define PRU_PROTOSEND 21 /* send to below */ /* end for protocol's internal use */ #define PRU_SEND_EOF 22 /* send and close */ #define PRU_SOSETLABEL 23 /* MAC label change */ #define PRU_CLOSE 24 /* socket close */ #define PRU_FLUSH 25 /* flush the socket */ #define PRU_NREQ 25 #ifdef PRUREQUESTS const char *prurequests[] = { "ATTACH", "DETACH", "BIND", "LISTEN", "CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN", "RCVD", "SEND", "ABORT", "CONTROL", "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", "PROTORCV", "PROTOSEND", "SEND_EOF", "SOSETLABEL", "CLOSE", "FLUSH", }; #endif #ifdef _KERNEL /* users shouldn't see this decl */ struct ifnet; struct stat; struct ucred; struct uio; /* * If the ordering here looks odd, that's because it's alphabetical. These * should eventually be merged back into struct protosw. * * Some fields initialized to defaults if they are NULL. - * See uipc_domain.c:net_init_domain() */ struct pr_usrreqs { void (*pru_abort)(struct socket *so); int (*pru_accept)(struct socket *so, struct sockaddr **nam); int (*pru_attach)(struct socket *so, int proto, struct thread *td); int (*pru_bind)(struct socket *so, struct sockaddr *nam, struct thread *td); int (*pru_connect)(struct socket *so, struct sockaddr *nam, struct thread *td); int (*pru_connect2)(struct socket *so1, struct socket *so2); int (*pru_control)(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td); void (*pru_detach)(struct socket *so); int (*pru_disconnect)(struct socket *so); int (*pru_listen)(struct socket *so, int backlog, struct thread *td); int (*pru_peeraddr)(struct socket *so, struct sockaddr **nam); int (*pru_rcvd)(struct socket *so, int flags); int (*pru_rcvoob)(struct socket *so, struct mbuf *m, int flags); int (*pru_send)(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td); #define PRUS_OOB 0x1 #define PRUS_EOF 0x2 #define PRUS_MORETOCOME 0x4 #define PRUS_NOTREADY 0x8 #define PRUS_IPV6 0x10 int (*pru_ready)(struct socket *so, struct mbuf *m, int count); int (*pru_sense)(struct socket *so, struct stat *sb); int (*pru_shutdown)(struct socket *so); int (*pru_flush)(struct socket *so, int direction); int (*pru_sockaddr)(struct socket *so, struct sockaddr **nam); int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int (*pru_sopoll)(struct socket *so, int events, struct ucred *cred, struct thread *td); void (*pru_sosetlabel)(struct socket *so); void (*pru_close)(struct socket *so); int (*pru_bindat)(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int (*pru_connectat)(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int (*pru_aio_queue)(struct socket *so, struct kaiocb *job); }; /* * All nonvoid pru_*() functions below return EOPNOTSUPP. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam); int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job); int pru_attach_notsupp(struct socket *so, int proto, struct thread *td); int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td); int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int pru_connect2_notsupp(struct socket *so1, struct socket *so2); int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td); int pru_disconnect_notsupp(struct socket *so); int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td); int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam); int pru_rcvd_notsupp(struct socket *so, int flags); int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags); int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td); int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count); int pru_sense_null(struct socket *so, struct stat *sb); int pru_shutdown_notsupp(struct socket *so); int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam); int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td); #endif /* _KERNEL */ /* * The arguments to the ctlinput routine are * (*protosw[].pr_ctlinput)(cmd, sa, arg); * where cmd is one of the commands below, sa is a pointer to a sockaddr, * and arg is a `void *' argument used within a protocol family. */ #define PRC_IFDOWN 0 /* interface transition */ #define PRC_ROUTEDEAD 1 /* select new route if possible ??? */ #define PRC_IFUP 2 /* interface has come back up */ /* was PRC_QUENCH2 3 DEC congestion bit says slow down */ /* was PRC_QUENCH 4 Deprecated by RFC 6633 */ #define PRC_MSGSIZE 5 /* message size forced drop */ #define PRC_HOSTDEAD 6 /* host appears to be down */ #define PRC_HOSTUNREACH 7 /* deprecated (use PRC_UNREACH_HOST) */ #define PRC_UNREACH_NET 8 /* no route to network */ #define PRC_UNREACH_HOST 9 /* no route to host */ #define PRC_UNREACH_PROTOCOL 10 /* dst says bad protocol */ #define PRC_UNREACH_PORT 11 /* bad port # */ /* was PRC_UNREACH_NEEDFRAG 12 (use PRC_MSGSIZE) */ #define PRC_UNREACH_SRCFAIL 13 /* source route failed */ #define PRC_REDIRECT_NET 14 /* net routing redirect */ #define PRC_REDIRECT_HOST 15 /* host routing redirect */ #define PRC_REDIRECT_TOSNET 16 /* redirect for type of service & net */ #define PRC_REDIRECT_TOSHOST 17 /* redirect for tos & host */ #define PRC_TIMXCEED_INTRANS 18 /* packet lifetime expired in transit */ #define PRC_TIMXCEED_REASS 19 /* lifetime expired on reass q */ #define PRC_PARAMPROB 20 /* header incorrect */ #define PRC_UNREACH_ADMIN_PROHIB 21 /* packet administrativly prohibited */ #define PRC_NCMDS 22 #define PRC_IS_REDIRECT(cmd) \ ((cmd) >= PRC_REDIRECT_NET && (cmd) <= PRC_REDIRECT_TOSHOST) #ifdef PRCREQUESTS char *prcrequests[] = { "IFDOWN", "ROUTEDEAD", "IFUP", "DEC-BIT-QUENCH2", "QUENCH", "MSGSIZE", "HOSTDEAD", "#7", "NET-UNREACH", "HOST-UNREACH", "PROTO-UNREACH", "PORT-UNREACH", "#12", "SRCFAIL-UNREACH", "NET-REDIRECT", "HOST-REDIRECT", "TOSNET-REDIRECT", "TOSHOST-REDIRECT", "TX-INTRANS", "TX-REASS", "PARAMPROB", "ADMIN-UNREACH" }; #endif /* * The arguments to ctloutput are: * (*protosw[].pr_ctloutput)(req, so, level, optname, optval, p); * req is one of the actions listed below, so is a (struct socket *), * level is an indication of which protocol layer the option is intended. * optname is a protocol dependent socket option request, * optval is a pointer to a mbuf-chain pointer, for value-return results. * The protocol is responsible for disposal of the mbuf chain *optval * if supplied, * the caller is responsible for any space held by *optval, when returned. * A non-zero return from ctloutput gives an * UNIX error number which should be passed to higher level software. */ #define PRCO_GETOPT 0 #define PRCO_SETOPT 1 #define PRCO_NCMDS 2 #ifdef PRCOREQUESTS char *prcorequests[] = { "GETOPT", "SETOPT", }; #endif #ifdef _KERNEL void pfctlinput(int, struct sockaddr *); struct domain *pffinddomain(int family); struct protosw *pffindproto(int family, int protocol, int type); struct protosw *pffindtype(int family, int type); int pf_proto_register(int family, struct protosw *npr); int pf_proto_unregister(int family, int protocol, int type); #endif #endif